如何计算每个人物在邮件关系网络中的影响力?
我们用 PageRank 算法来计算网络中各个节点的影响力,并用 networkx 中的图(Graph)来展示节点之间的关系。
如图所示,A、B、C、D 代表 4 个网页,箭头表示可以从一个网页跳转到另一个网页。下面我们来计算这 4 个网页的影响力。
PageRank 算法的原理是:一个网页的影响力 = 所有入链页面的加权影响力之和,即 $PR(u)=\sum_{v\in B_u}\frac{PR(v)}{L(v)}$,其中 $B_u$ 是页面 $u$ 的入链页面集合,$L(v)$ 是页面 $v$ 的出链数量。
Google 的两位创始人都是斯坦福大学的博士生,他们提出的 PageRank 算法受到了论文影响力因子这一评价方式的启发:一篇论文被引用的次数越多,说明这篇论文的影响力越大。正是这个想法解决了当时网页检索质量不高的问题。
import matplotlib.pyplot as plt
import networkx as nx

# Build a small directed graph of four pages; each pair is (source, target),
# i.e. "source links to target".
G = nx.DiGraph()
edges = [
    ("A", "B"), ("A", "C"), ("A", "D"),
    ("B", "A"), ("B", "D"),
    ("C", "A"),
    ("D", "B"), ("D", "C"),
]
G.add_edges_from(edges)

# alpha is the damping factor (alpha=0.85 would mean a 15% random-jump rate);
# alpha=1 disables random jumps entirely.
pagerank_scores = nx.pagerank(G, alpha=1)
print("pagerank 值是:", pagerank_scores)

# Store each node's score on the graph under the 'pagerank_list' attribute.
nx.set_node_attributes(G, values=pagerank_scores, name='pagerank_list')

# Place the nodes on a circle (nx.spring_layout(G) is the force-directed
# alternative), then draw nodes, edges and node labels in turn.
layout = nx.circular_layout(G)
nx.draw_networkx_nodes(G, layout, alpha=0.4)
nx.draw_networkx_edges(G, layout, alpha=0.2)
nx.draw_networkx_labels(G, layout, font_size=10)
plt.show()
https://www.cnblogs.com/jpcflyer/p/11180263.html
接下来我们看看如何利用 PageRank 和 networkx 的图来分析并展示希拉里邮件中的人物关系。
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
from collections import defaultdict

# Load the e-mail table and the two lookup tables used to normalise names.
emails = pd.read_csv("E:/数据学习网站/PageRank-master/input/Emails.csv")

# Alias -> PersonId (a later duplicate alias overrides an earlier one).
file = pd.read_csv("E:/数据学习网站/PageRank-master/input/Aliases.csv")
aliases = dict(zip(file['Alias'], file['PersonId']))

# PersonId -> Name.
file = pd.read_csv("E:/数据学习网站/PageRank-master/input/Persons.csv")
persons = dict(zip(file['Id'], file['Name']))


def unify_name(name):
    """Map a raw sender/recipient value to a single canonical name.

    The value is converted to ``str``, lower-cased, stripped of commas and
    truncated at the first ``@``. If the result is a known alias, the
    matching entry of ``persons`` is returned; otherwise the normalised
    string itself is returned.

    NOTE(review): a missing value (NaN) would become the string "nan" here
    and end up as a graph node -- confirm whether that is intended.
    """
    name = str(name).lower()
    name = name.replace(",", "").split("@")[0]
    if name in aliases:
        return persons[aliases[name]]
    return name


def show_graph(graph, layout='spring_layout'):
    """Draw ``graph`` with node size driven by PageRank and edge width by weight.

    Args:
        graph: a graph whose nodes carry a ``'pagerank'`` attribute and whose
            edges carry a ``'weight'`` attribute.
        layout: ``"circular_layout"`` for a circular layout; any other value
            falls back to the spring layout.
    """
    if layout == "circular_layout":
        position = nx.circular_layout(graph)
    else:
        position = nx.spring_layout(graph)
    # Use `nodes(data=True)`: the old `Graph.node` alias is gone from current
    # networkx releases.
    nodesize = [attrs['pagerank'] * 20000 for _, attrs in graph.nodes(data=True)]
    # Square root keeps heavily used edges from dominating the picture.
    edgesize = [np.sqrt(attrs['weight']) for _, _, attrs in graph.edges(data=True)]
    nx.draw_networkx_nodes(graph, position, node_size=nodesize, alpha=0.4)
    # `width` is the line-width parameter; `edge_size` is not accepted.
    nx.draw_networkx_edges(graph, position, width=edgesize, alpha=0.2)
    nx.draw_networkx_labels(graph, position, font_size=10)
    plt.show()


# Normalise both ends of every e-mail to canonical names.
emails.MetadataTo = emails.MetadataTo.apply(unify_name)
emails.MetadataFrom = emails.MetadataFrom.apply(unify_name)

# Count e-mails per (sender, recipient) pair. Edges point from the sender to
# the recipient so that PageRank flows towards the people who receive mail.
edges_weights_temp = defaultdict(int)
for sender, recipient in zip(emails.MetadataFrom, emails.MetadataTo):
    edges_weights_temp[(sender, recipient)] += 1

# (source, target, weight) triples, the form add_weighted_edges_from expects.
edges_weight = [(src, dst, count) for (src, dst), count in edges_weights_temp.items()]

graph = nx.DiGraph()
graph.add_weighted_edges_from(edges_weight)

# Compute PageRank and attach it to every node for show_graph to read.
pagerank = nx.pagerank(graph)
nx.set_node_attributes(graph, name='pagerank', values=pagerank)
show_graph(graph)

# Second view: keep only nodes whose PageRank reaches the threshold.
pagerank_threshold = 0.005
small_graph = graph.copy()
# Iterate the original graph while removing from the copy, so the node view
# being iterated is never mutated.
for n, p_rank in graph.nodes(data=True):
    if p_rank["pagerank"] < pagerank_threshold:
        small_graph.remove_node(n)
show_graph(small_graph, "circular_layout")
03-10
608