import networkx as nx  # graph mining
import gensim
from gensim.models import Word2Vec  # NLP; tested with gensim==4.2.0
# Data analysis
import pandas as pd
import numpy as np
import random  # random choice for the walker
from tqdm import tqdm  # progress bars
# Visualization
import matplotlib.pyplot as plt
print(gensim.__version__)
plt.rcParams['font.sans-serif'] = ['SimHei']  # render CJK labels correctly
plt.rcParams['axes.unicode_minus'] = False  # render minus signs correctly
df = pd.read_csv("seealsology-data(1).tsv", sep = "\t")
G = nx.from_pandas_edgelist(df, "source", "target", edge_attr=True, create_using=nx.Graph())
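# Optional sanity check: confirm the edge list was parsed as expected by
# printing the node and edge counts of the resulting graph.
print(G.number_of_nodes(), G.number_of_edges())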
def get_randomwalk(node, path_length):
    '''
    Given a start node and a maximum path length, generate a
    random-walk node sequence (never revisiting a node).
    '''
    random_walk = [node]
    for i in range(path_length - 1):
        # Collect neighbors that have not been visited yet
        temp = list(G.neighbors(node))
        temp = list(set(temp) - set(random_walk))
        if len(temp) == 0:
            break
        # Randomly pick the next node among the unvisited neighbors
        random_node = random.choice(temp)
        random_walk.append(random_node)
        node = random_node
    return random_walk
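# A minimal smoke test of the walker (assumes 'random forest' appears as a
# node in this dataset, as it is queried later; substitute any node from
# G.nodes() otherwise).
print(get_randomwalk('random forest', 5))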
all_nodes = list(G.nodes())
gamma = 10       # number of random walks started from each node
walk_length = 5  # maximum length of each random walk
random_walks = []
for n in tqdm(all_nodes):   # iterate over every node
    for i in range(gamma):  # start gamma walks from each node
        random_walks.append(get_randomwalk(n, walk_length))
# Number of generated random-walk sequences
print(len(random_walks))
print(random_walks[1])
model = Word2Vec(vector_size=64,    # embedding dimensionality
                 window=4,          # context window size
                 sg=1,              # use Skip-Gram
                 hs=0,              # no hierarchical softmax
                 negative=10,       # number of negative samples
                 alpha=0.03,        # initial learning rate
                 min_alpha=0.0007,  # minimum learning rate
                 seed=14            # random seed
                 )
# Build the vocabulary from the random-walk sequences
model.build_vocab(random_walks, progress_per=2)
# Train (takes roughly a minute)
model.train(random_walks, total_examples=model.corpus_count, epochs=50, report_delay=1)
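# Optional: persist the trained model with gensim's standard save/load API,
# so the walks and training need not be re-run. The filename below is just
# an example.
model.save('deepwalk_seealsology.model')
# model = Word2Vec.load('deepwalk_seealsology.model')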
# Inspect the embedding of a single node
print(model.wv.get_vector('random forest').shape)
print(model.wv.get_vector('random forest'))
# Find the most similar terms
print(model.wv.similar_by_word('decision tree'))
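# Pairwise cosine similarity between two specific terms is also available
# (both terms assumed to be in the vocabulary):
print(model.wv.similarity('decision tree', 'random forest'))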
X = model.wv.vectors
# Reduce the embeddings to 2D with PCA
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
embed_2d = pca.fit_transform(X)
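# Optional diagnostic: how much of the variance the 2D projection retains.
print(pca.explained_variance_ratio_.sum())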
plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
plt.show()
term = 'computer vision'
term_vec = model.wv[term].reshape(1, -1)  # the term's 64-d embedding
term_2d = pca.transform(term_vec)
plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])
plt.scatter(term_2d[:,0],term_2d[:,1],c='r',s=200)
plt.show()
# Compute PageRank centrality
pagerank = nx.pagerank(G)
# Sort nodes by importance, descending
node_importance = sorted(pagerank.items(), key=lambda x: x[1], reverse=True)
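# Quick peek at the head of the ranking: (node, PageRank score) pairs.
print(node_importance[:5])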
# Keep the n most important nodes
n = 30
terms_chosen = []
for each in node_importance[:n]:
    terms_chosen.append(each[0])
# Manually add extra terms of interest
terms_chosen.extend([
    'computer vision', 'deep learning', 'convolutional neural network',
    'convolution', 'natural-language processing', 'attention (machine learning)',
    'support-vector machine', 'decision tree', 'random forest',
    'computational imaging', 'machine vision', 'cognitive science',
    'neuroscience', 'psychophysics', 'brain', 'visual cortex',
    'visual neuroscience', 'cognitive model', 'finite difference',
    'finite difference time domain', 'finite difference coefficients',
    'finite difference methods for option pricing', 'iso 128', 'iso 10303'
])
# Map a term to its index in the Word2Vec vocabulary
term2index = model.wv.key_to_index
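# The hand-added terms above must match node names exactly; filtering them
# against the vocabulary first avoids a KeyError in the plots below if any
# term is absent.
terms_chosen = [t for t in terms_chosen if t in term2index]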
# Plot the 2D embeddings of all terms and highlight the chosen ones
plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])
for item in terms_chosen:
    idx = term2index[item]
    plt.scatter(embed_2d[idx,0], embed_2d[idx,1], c='r', s=50)
    plt.annotate(item, xy=(embed_2d[idx,0], embed_2d[idx,1]), c='k', fontsize=12)
plt.show()
# Reduce the embeddings to 2D with t-SNE instead
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, n_iter=1000)
embed_2d = tsne.fit_transform(X)
plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:, 0], embed_2d[:, 1])
plt.show()
plt.figure(figsize=(14,14))
plt.scatter(embed_2d[:,0], embed_2d[:,1])
for item in terms_chosen:
    idx = term2index[item]
    plt.scatter(embed_2d[idx,0], embed_2d[idx,1], c='r', s=50)
    plt.annotate(item, xy=(embed_2d[idx,0], embed_2d[idx,1]), c='k', fontsize=12)
plt.show()
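# A minimal sketch for exporting the 2D coordinates alongside the node
# names, e.g. for inspection in an external tool. Rows of embed_2d align
# with model.wv.index_to_key because X was taken from model.wv.vectors.
# The output filename is just an example.
export_df = pd.DataFrame(embed_2d, columns=['x', 'y'])
export_df['term'] = model.wv.index_to_key
export_df.to_csv('embedding_2d.tsv', sep='\t', index=False)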