Python网络分析-networkx第一版的封装

本文主要是对我在写实验时用到的networkx所做的一个初步封装。这一版其实不是很标准,现在正在写第二版,先把之前的代码贴上来。主要参考的文档就是networkx的官方文档。
[networkx-reference]

我需要说明一点,下面的代码针对的是无向图

代码

下面这一部分代码是对networkx的初步封装。

  • GraphOperation.py
#-*- coding:utf-8 -*-
import networkx as nx
import matplotlib.pyplot as plt
import traceback

'''
我对networkx 的封装
还是一个图操作-工具类
'''

class GraphOperation:
    """Thin convenience wrapper around a single networkx graph.

    The wrapped graph lives in ``self.graph`` and starts out as an empty
    undirected ``nx.Graph``.

    Error handling: every method except ``__init__`` and the ``convert_to_*``
    methods catches any exception, prints the traceback and returns ``None``
    instead of raising.
    """

    # ----------------- graph operation -----------------

    def __init__(self):
        """Create the wrapper around an empty undirected ``nx.Graph``."""
        self.graph = nx.Graph()

    def convert_to_directed_graph(self):
        """Replace the wrapped graph with an ``nx.DiGraph`` built from it.

        The current graph is passed to the constructor so that existing
        nodes and edges are carried over instead of being discarded.
        """
        self.graph = nx.DiGraph(self.graph)

    def convert_to_multi_graph(self):
        """Replace the wrapped graph with an ``nx.MultiGraph`` built from it."""
        self.graph = nx.MultiGraph(self.graph)

    def convert_to_undirected_graph(self):
        """Replace the wrapped graph with an undirected ``nx.Graph`` built from it."""
        self.graph = nx.Graph(self.graph)

    def clear_graph(self):
        """Remove all nodes and edges from the wrapped graph."""
        try:
            self.graph.clear()
        except Exception:
            traceback.print_exc()

    # ------------------ node operation ------------------

    def add_node(self, node):
        """Add a single node."""
        try:
            self.graph.add_node(node)
        except Exception:
            traceback.print_exc()

    def add_nodes_by_list(self, node_list):
        """Add every node contained in ``node_list``."""
        try:
            self.graph.add_nodes_from(node_list)
        except Exception:
            traceback.print_exc()

    def remove_node(self, node):
        """Remove a single node."""
        try:
            self.graph.remove_node(node)
        except Exception:
            traceback.print_exc()

    def remove_nodes_by_list(self, node_list):
        """Remove every node contained in ``node_list``."""
        try:
            self.graph.remove_nodes_from(node_list)
        except Exception:
            traceback.print_exc()

    def get_number_of_nodes(self):
        """Return the number of nodes in the graph."""
        try:
            return self.graph.number_of_nodes()
        except Exception:
            traceback.print_exc()

    def get_nodes(self):
        """Return the nodes of the graph as given by ``graph.nodes()``."""
        try:
            return self.graph.nodes()
        except Exception:
            traceback.print_exc()

    def get_neighbors(self, v):
        """Return the neighbors of node ``v`` as given by ``graph.neighbors(v)``."""
        try:
            return self.graph.neighbors(v)
        except Exception:
            traceback.print_exc()

    # --------------- edge operation ---------------

    def add_edge(self, u, v):
        """Add an edge between nodes ``u`` and ``v``."""
        try:
            self.graph.add_edge(u, v)
        except Exception:
            traceback.print_exc()

    def add_edge_by_tuple(self, e):
        """Add the edge described by the tuple ``e``, unpacked into ``add_edge``."""
        try:
            self.add_edge(*e)
        except Exception:
            traceback.print_exc()

    def add_edges_by_list(self, edge_list):
        """Add every edge in ``edge_list`` (a list of edge tuples)."""
        try:
            self.graph.add_edges_from(edge_list)
        except Exception:
            traceback.print_exc()

    def remove_edge(self, u, v):
        """Remove the edge between nodes ``u`` and ``v``."""
        try:
            self.graph.remove_edge(u, v)
        except Exception:
            traceback.print_exc()

    def remove_edge_by_tuple(self, e):
        """Remove the edge described by the tuple ``e``."""
        try:
            self.remove_edge(*e)
        except Exception:
            traceback.print_exc()

    def remove_edges_by_list(self, edge_list):
        """Remove every edge in ``edge_list`` (a list of edge tuples)."""
        try:
            # The removal has to go through the wrapped graph; this class
            # itself has no ``remove_edges_from`` method.
            self.graph.remove_edges_from(edge_list)
        except Exception:
            traceback.print_exc()

    def get_number_of_edges(self):
        """Return the number of edges in the graph."""
        try:
            return self.graph.number_of_edges()
        except Exception:
            traceback.print_exc()

    def get_edges(self):
        """Return the edges of the graph as given by ``graph.edges()``."""
        try:
            return self.graph.edges()
        except Exception:
            traceback.print_exc()

    def add_weighted_edge(self, weighted_edge_list):
        """Add the weighted edges in ``weighted_edge_list``.

        The list is handed to ``add_weighted_edges_from`` unchanged, so each
        item is expected to be a ``(u, v, weight)`` tuple.
        """
        try:
            self.graph.add_weighted_edges_from(weighted_edge_list)
        except Exception:
            traceback.print_exc()

    def get_weighted_edge(self):
        """Return the edges together with their ``'weight'`` attribute."""
        try:
            return self.graph.edges(data='weight')
        except Exception:
            traceback.print_exc()

    # --------------- degree analysis ---------------

    def get_degree(self):
        """Return the degree of all nodes as given by ``graph.degree()``.

        NOTE(review): the concrete return type (dict vs. degree view)
        depends on the installed networkx version.
        """
        try:
            return self.graph.degree()
        except Exception:
            traceback.print_exc()

    def get_degree_by_node(self, node_id):
        """Return the degree of the node ``node_id``."""
        try:
            return self.graph.degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_degree_based_on_weight_by_node(self, node_id):
        """Return the weighted degree of ``node_id``.

        The degree is the sum of the ``"weight"`` attribute of the incident
        edges rather than their count, e.g. with edges (1, 2, 0.5) and
        (3, 1, 0.75) node 1 has a weighted degree of 1.25, not 2.
        """
        try:
            return self.graph.degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    def get_sorted_degrees(self):
        """Return the degree values of all nodes, sorted in descending order."""
        try:
            # dict() accepts both the plain dict and the (node, degree) view
            # that nx.degree may return, so .values() is always available.
            degree_by_node = dict(nx.degree(self.graph))
            return sorted(degree_by_node.values(), reverse=True)
        except Exception:
            traceback.print_exc()

    def get_in_degree(self):
        """Return the in-degree of all nodes (needs a graph with ``in_degree``)."""
        try:
            return self.graph.in_degree()
        except Exception:
            traceback.print_exc()

    def get_in_degree_by_node(self, node_id):
        """Return the in-degree of the node ``node_id``."""
        try:
            return self.graph.in_degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_in_degree_based_on_weight_by_node(self, node_id):
        """Return the in-degree of ``node_id`` weighted by the ``"weight"`` attribute."""
        try:
            return self.graph.in_degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    def get_out_degree(self):
        """Return the out-degree of all nodes (needs a graph with ``out_degree``)."""
        try:
            return self.graph.out_degree()
        except Exception:
            traceback.print_exc()

    def get_out_degree_by_node(self, node_id):
        """Return the out-degree of the node ``node_id``."""
        try:
            return self.graph.out_degree(node_id)
        except Exception:
            traceback.print_exc()

    def get_out_degree_based_on_weight_by_node(self, node_id):
        """Return the out-degree of ``node_id`` weighted by the ``"weight"`` attribute."""
        try:
            return self.graph.out_degree(node_id, weight="weight")
        except Exception:
            traceback.print_exc()

    # ---------- component analysis ----------

    def get_connected_components(self):
        """Return the result of ``nx.connected_components`` for the graph."""
        try:
            return nx.connected_components(self.graph)
        except Exception:
            traceback.print_exc()

    # ---------- drawing graph ----------

    def draw_graph(self, title):
        """Draw the graph with ``nx.draw`` under ``title`` and show the figure."""
        try:
            plt.title(title)
            nx.draw(self.graph)
            # The title is already set above; show() takes no title argument.
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_network(self):
        """Draw the graph with ``nx.draw_networkx`` using a spring layout."""
        try:
            # draw_networkx needs the computed positions, not the layout
            # function itself.
            pos = nx.spring_layout(self.graph)
            nx.draw_networkx(self.graph, pos)
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_graph_random_layout(self):
        """Draw the graph with ``nx.draw_random`` and show the figure."""
        try:
            nx.draw_random(self.graph)
            plt.show()
        except Exception:
            traceback.print_exc()

    def draw_graph_spring_layout(self):
        """Draw the graph with ``nx.draw_spring`` and show the figure."""
        try:
            nx.draw_spring(self.graph)
            plt.show()
        except Exception:
            traceback.print_exc()

    # ---------- graph methods ----------

    def get_degree_distribution(self):
        """Return the degree histogram of the graph.

        Unlike the degree getters, which report the degree of each node,
        this counts how many nodes have each degree value, using
        ``nx.degree_histogram``.
        """
        try:
            return nx.degree_histogram(self.graph)
        except Exception:
            traceback.print_exc()

    def get_density(self):
        """Return the density of the graph (``nx.density``)."""
        try:
            return nx.density(self.graph)
        except Exception:
            traceback.print_exc()

    def get_transitivity(self):
        """Return the transitivity, i.e. the global clustering coefficient."""
        try:
            return nx.transitivity(self.graph)
        except Exception:
            traceback.print_exc()

    # The misspelled name is kept because callers use it.
    def get_averate_clustering(self):
        """Return the average clustering coefficient (``nx.average_clustering``)."""
        try:
            return nx.average_clustering(self.graph)
        except Exception:
            traceback.print_exc()

    def get_average_shortest_path_length(self):
        """Return the average shortest path length of the graph."""
        try:
            return nx.average_shortest_path_length(self.graph)
        except Exception:
            traceback.print_exc()

    def write_to_pajek(self, pajek_net_path):
        """Write the graph to ``pajek_net_path`` using ``nx.write_pajek``."""
        try:
            nx.write_pajek(self.graph, pajek_net_path)
        except Exception:
            traceback.print_exc()

    # --------------------------------------------------------
    # -------------- centrality ------------------------------
    # --------------------------------------------------------

    def get_degree_centrality(self):
        """Return the degree centrality of the nodes.

        The degree centrality for a node v is the fraction of nodes it is
        connected to.
        """
        try:
            return nx.degree_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_betweenness_centrality(self):
        """Return the betweenness centrality of the nodes.

        Betweenness centrality of a node v is the sum of the fraction of
        all-pairs shortest paths that pass through v.
        """
        try:
            return nx.betweenness_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_load_centrality(self):
        """Return the load centrality of the nodes.

        The load centrality of a node is the fraction of all shortest paths
        that pass through that node.
        """
        try:
            return nx.load_centrality(self.graph)
        except Exception:
            traceback.print_exc()

    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of the nodes.

        Eigenvector centrality computes the centrality for a node based on
        the centrality of its neighbors.
        """
        try:
            return nx.eigenvector_centrality(self.graph)
        except Exception:
            traceback.print_exc()

MyGraph.py

#-*- coding:utf-8 -*-
from GraphOperation import*

'''
基于我自己的工具类MyGraph
写一个图的操作类,实现图的各种操作
'''

class MyGraph:
    """Second wrapping layer: builds a graph from cliques and analyses it.

    All graph work is delegated to a ``GraphOperation`` instance kept in
    ``self.my_graph``. Node names are mapped to integer ids, and the
    ``cal_*`` methods write their result to a text file whose path is
    ``self.output_path`` concatenated with a fixed file name.

    Error handling: every method except ``__init__`` and the private
    ``_write_lines`` helper catches any exception, prints the traceback and
    returns ``None`` instead of raising.
    """

    def __init__(self):
        """Set up an empty graph, empty name/id mappings and no output path."""
        self.my_graph = GraphOperation()
        self.map_name_to_number = dict()
        self.map_number_to_name = dict()
        self.output_path = ""

        self.clique_list = []  # numeric cliques, kept for draw_community

        self.max_connected_component_subgraph = None

    def construct_graph(self, clique_list):
        """Build the graph from ``clique_list`` and fill both name mappers.

        Each clique is a sequence of node names. Every name gets an integer
        id; within a clique every pair of distinct nodes is connected. The
        cliques translated to ids are stored in ``self.clique_list``.
        """
        try:
            # Continue after the ids handed out so far, so that calling this
            # method again never reuses an id already mapped to another name.
            number = len(self.map_name_to_number) + 1
            new_clique_list = []

            for clique in clique_list:
                new_clique = []
                for u in clique:
                    if u not in self.map_name_to_number:
                        self.map_name_to_number[u] = number
                        number += 1
                    new_clique.append(self.map_name_to_number[u])
                new_clique_list.append(new_clique)

            # Rebuild the reverse mapping (id -> name).
            self.map_number_to_name = dict()
            for name, node_id in self.map_name_to_number.items():
                self.map_number_to_name[node_id] = name

            self.clique_list = new_clique_list

            for clique in new_clique_list:
                for u in clique:
                    # Add the node on its own so single-member cliques are kept.
                    self.my_graph.add_node(u)

                    for v in clique:
                        if u == v:
                            continue
                        self.my_graph.add_edge_by_tuple((u, v))

            print("[INFO]: construct_graph is finished!")
        except Exception:
            traceback.print_exc()

    def add_edge(self, u, v):
        """Add an edge between ``u`` and ``v`` to the wrapped graph."""
        try:
            self.my_graph.add_edge(u, v)
        except Exception:
            traceback.print_exc()

    def get_all_edges(self):
        """Return all edges of the wrapped graph."""
        try:
            return self.my_graph.get_edges()
        except Exception:
            traceback.print_exc()

    def set_output_path(self, output_path):
        """Set the path prefix used for all output files.

        File names are appended by plain string concatenation, so the prefix
        has to end with a path separator if it names a directory.
        """
        try:
            self.output_path = output_path
            print("[INFO]: set_output_path is finished!")
        except Exception:
            traceback.print_exc()

    def set_max_connected_component_subgraph(self):
        """Compute and store the largest connected component as a subgraph.

        This can only be done once the whole graph has been built, which is
        why it lives in this layer rather than in ``GraphOperation``.
        """
        try:
            graph = self.my_graph.graph
            # connected_components + subgraph().copy() replaces
            # nx.connected_component_subgraphs, which newer networkx
            # releases no longer provide.
            largest_component = max(nx.connected_components(graph), key=len)
            self.max_connected_component_subgraph = graph.subgraph(largest_component).copy()
            print("[INFO]: set_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()

    def get_max_connected_component_subgraph(self):
        """Return the stored largest-component subgraph (a raw networkx graph).

        ``None`` until ``set_max_connected_component_subgraph`` has run.
        """
        try:
            return self.max_connected_component_subgraph
        except Exception:
            traceback.print_exc()

    # -----------------------------------------------------------------------
    # ----------------------- draw the network ------------------------------
    # -----------------------------------------------------------------------

    def draw_community(self):
        """Draw the graph with one color per clique stored in ``clique_list``.

        Each clique's nodes and edges are drawn separately with their own
        color. The raw networkx drawing functions are used directly because
        ``GraphOperation`` does not wrap them.
        """
        try:
            graph = self.my_graph.graph
            pos = nx.spring_layout(graph)
            node_size_ = 100
            color_list = ["red", "yellow", "blue", "green", "pink", "orange", "purple"]
            color_list_len = len(color_list)

            for i, node_list in enumerate(self.clique_list):
                edge_list = self.get_edges_for_community(node_list)

                # Cycle through the palette so that more cliques than colors
                # does not run past the end of color_list.
                nx.draw_networkx_nodes(graph, pos, node_list, node_size=node_size_,
                                       node_color=color_list[i % color_list_len], label="hello")
                nx.draw_networkx_edges(graph, pos, edge_list)

            title = "people relation by train"
            plt.title(title)
            plt.show()

            print("[INFO]: draw_community is finished!")
        except Exception:
            traceback.print_exc()

    def get_edges_for_community(self, node_list):
        """Return every ordered pair ``(u, v)`` of unequal nodes in ``node_list``."""
        try:
            return [(u, v) for u in node_list for v in node_list if u != v]
        except Exception:
            traceback.print_exc()

    def draw_graph(self, title):
        """Draw the graph with the basic layout under ``title``."""
        try:
            self.my_graph.draw_graph(title)
            print("[INFO]: draw_graph is finished!")
        except Exception:
            traceback.print_exc()

    def draw_network(self):
        """Draw the graph through ``GraphOperation.draw_network``."""
        try:
            # Delegate to the wrapped object; calling self.draw_network()
            # here would recurse forever.
            self.my_graph.draw_network()
        except Exception:
            traceback.print_exc()

    def draw_graph_random_layout(self):
        """Draw the graph with a random layout."""
        try:
            self.my_graph.draw_graph_random_layout()
        except Exception:
            traceback.print_exc()

    def draw_graph_spring_layout(self):
        """Draw the graph with a spring layout."""
        try:
            self.my_graph.draw_graph_spring_layout()
            print("[INFO]: draw_graph_spring_layout is finished!")
        except Exception:
            traceback.print_exc()

    # -----------------------------------------------------------------------
    # ----------------------- network analysis ------------------------------
    # -----------------------------------------------------------------------

    def _write_lines(self, file_name, values):
        """Write ``str(value)`` of each value on its own line.

        The target is ``self.output_path + file_name``; the file is closed
        even when writing fails.
        """
        file_path = self.output_path + file_name
        with open(file_path, "w") as outfile:
            for value in values:
                outfile.write(str(value) + '\n')

    def cal_num_of_nodes(self):
        """Write the number of nodes to ``number_of_nodes.txt``."""
        try:
            num_nodes = self.my_graph.get_number_of_nodes()
            self._write_lines("number_of_nodes.txt", [num_nodes])
            print("[INFO]: cal_num_of_nodes is finished!")
        except Exception:
            traceback.print_exc()

    def cal_num_of_edges(self):
        """Write the number of edges to ``number_of_edges.txt``."""
        try:
            num_edges = self.my_graph.get_number_of_edges()
            self._write_lines("number_of_edges.txt", [num_edges])
            print("[INFO]: cal_num_of_edges is finished!")
        except Exception:
            traceback.print_exc()

    def cal_degree_distribution(self):
        """Write the degree histogram, one count per line, to ``degree_distribution.txt``."""
        try:
            degree_distribution_list = self.my_graph.get_degree_distribution()
            self._write_lines("degree_distribution.txt", degree_distribution_list)
            print("[INFO]: cal_degree_distribution is finished!")
        except Exception:
            traceback.print_exc()

    def cal_density(self):
        """Write the graph density to ``graph_density.txt``."""
        try:
            density = self.my_graph.get_density()
            self._write_lines("graph_density.txt", [density])
            print("[INFO]: cal_density is finished!")
        except Exception:
            traceback.print_exc()

    def cal_transitivity(self):
        """Write the transitivity (global clustering coefficient) to ``transitivity.txt``."""
        try:
            transitivity = self.my_graph.get_transitivity()
            self._write_lines("transitivity.txt", [transitivity])
            print("[INFO]: cal_transitivity is finished!")
        except Exception:
            traceback.print_exc()

    def cal_average_clustering(self):
        """Write the average clustering coefficient to ``average_clustering.txt``."""
        try:
            average_clustering = self.my_graph.get_averate_clustering()
            self._write_lines("average_clustering.txt", [average_clustering])
            print("[INFO]: cal_average_clustering is finished!")
        except Exception:
            traceback.print_exc()

    def cal_average_shortest_path_length(self):
        """Write the average shortest path length to ``average_shortest_path_length.txt``."""
        try:
            aver_shortest_path = self.my_graph.get_average_shortest_path_length()
            self._write_lines("average_shortest_path_length.txt", [aver_shortest_path])
            print("[INFO]: cal_average_shortest_path_length is finished!")
        except Exception:
            traceback.print_exc()

    def write_to_pajek_net(self):
        """Write the graph by hand in Pajek ``.net`` form.

        The file ``graph_of_author_relation.net`` gets a ``*Vertices``
        section with ``id "name"`` lines followed by an ``*Edges`` section
        with ``u v`` lines.
        """
        try:
            output_path = self.output_path + "graph_of_author_relation.net"

            nodes_num = self.my_graph.get_number_of_nodes()
            edges_num = self.my_graph.get_number_of_edges()

            with open(output_path, "w") as outfile:
                outfile.write("*Vertices " + str(nodes_num) + '\n')
                for node in self.my_graph.get_nodes():
                    # Look the name up in the id -> name mapping filled by
                    # construct_graph.
                    name = str(self.map_number_to_name[node])
                    outfile.write(str(node) + ' ' + "\"" + name + "\"" + '\n')

                outfile.write("*Edges " + str(edges_num) + '\n')
                for edge in self.my_graph.get_edges():
                    outfile.write(str(edge[0]) + ' ' + str(edge[1]) + '\n')

            print("[INFO]: write_to_pajek_net is finished!")
        except Exception:
            traceback.print_exc()

    def write_to_pajek_net1(self):
        """Write the graph to ``graph_of_author_relation.net`` via ``nx.write_pajek``."""
        try:
            pajek_net_path = self.output_path + "graph_of_author_relation.net"
            self.my_graph.write_to_pajek(pajek_net_path)

            print("[INFO]: write_to_pajek_net1 is finished!")
        except Exception:
            traceback.print_exc()

    # --------------------------------------------------------
    # -------------- centrality ------------------------------
    # --------------------------------------------------------

    def get_degree_centrality(self):
        """Return the degree centrality of the nodes."""
        try:
            result = self.my_graph.get_degree_centrality()
            # Log before returning; a print placed after the return never runs.
            print("[INFO]: get_degree_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_betweenness_centrality(self):
        """Return the betweenness centrality of the nodes."""
        try:
            result = self.my_graph.get_betweenness_centrality()
            print("[INFO]: get_betweenness_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_load_centrality(self):
        """Return the load centrality of the nodes."""
        try:
            result = self.my_graph.get_load_centrality()
            print("[INFO]: get_load_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    def get_eigenvector_centrality(self):
        """Return the eigenvector centrality of the nodes."""
        try:
            result = self.my_graph.get_eigenvector_centrality()
            print("[INFO]: get_eigenvector_centrality is finished!")
            return result
        except Exception:
            traceback.print_exc()

    # --------------------------------------------------------
    # -------------- component -------------------------------
    # --------------------------------------------------------

    def draw_max_connected_component_subgraph(self):
        """Draw the stored largest-component subgraph without node labels."""
        try:
            nx.draw_networkx(self.get_max_connected_component_subgraph(), with_labels=False)
            title = "Max connected subgraph of Collaboration Network"
            plt.title(title)
            plt.show()

            print("[INFO]: draw_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()

    def get_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Return the average shortest path length of the largest component."""
        try:
            res = nx.average_shortest_path_length(self.get_max_connected_component_subgraph())
            print("[INFO]: get_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
            return res
        except Exception:
            traceback.print_exc()

    def cal_average_shortest_path_length_in_max_connected_component_subgraph(self):
        """Write the largest component's average shortest path length to a file.

        The file name is
        ``average_shortest_path_length_in_max_connected_subgraph.txt``.
        """
        try:
            aver_shortest_path = self.get_average_shortest_path_length_in_max_connected_component_subgraph()
            self._write_lines("average_shortest_path_length_in_max_connected_subgraph.txt",
                              [aver_shortest_path])
            print("[INFO]: cal_average_shortest_path_length_in_max_connected_component_subgraph is finished!")
        except Exception:
            traceback.print_exc()
#----------------------------------------------------------------------------
下面这一部分代码就不针对networkx了,主要是xml的封装类,以及 测试部分的代码
- XmlParser

#-*- coding:utf-8
import xml.etree.ElementTree as et
import traceback

'''
基于XML的数据提取以及分析
其实我只可以负责数据提取
但是毕竟是同一个XML,所以把数据分析写进来我认为也是合理的

'''

class XmlParser:
    def __init__(self, xml_path, stop_words_path):
        self.stop_words_path = stop_words_path

        tree = et.parse(xml_path)
        self.root = tree.getroot()

    # 1-pubmed 获取文章作者
    def get_article_author(self):
        try:

            res_list = []
            for pubmed_article in self.root:
                try:
                    #print "---------------------------------------------------"
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    article = medline_citation.findall("Article")[0]
                    author_list = article.findall("AuthorList")[0]
                    author_list = author_list.findall("Author")

                    current_authour_list = []
                    for author in author_list:
                        try:
                            last_name = author.findall("LastName")[0]
                            initials = author.findall("Initials")[0]
                            name = str(last_name.text) + ' ' + str(initials.text)
                            current_authour_list.append(name)
                            #print name
                        except:
                            continue

                    res_list.append(current_authour_list)
                except:
                    continue
            return res_list
        except Exception, e:
            print traceback.print_exc()

    # 1-1 PMC 获取文章作者
    def get_article_author1(self):
        try:

            res_list = []
            for article in self.root:
                try:
                    author_list = []
                    #print pubmed_article
                    #print "---------------------------------------------------"
                    front = article.findall("front")[0]
                    article_meta = front.findall("article-meta")[0]
                    contrib_group = article_meta.findall("contrib-group")[0]

                    contrib_list = contrib_group.findall("contrib")

                    for contrib in contrib_list:
                        name = contrib.findall("name")[0]

                        surname = name.findall("surname")[0]
                        given_name = name.findall("given-names")[0]

                        final_name = ""
                        final_name += str(given_name.text) + " " + str(surname.text)

                        author_list.append(final_name)
                        #print final_name

                    res_list.append(author_list)

                except:
                    continue
            return res_list
        except Exception, e:
            print traceback.print_exc()


    # 2_获得文章标题
    def get_article_title(self, root):
        try:
            article_title_list = []
            for pubmed_article in root:
                try:
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    article = medline_citation.findall("Article")[0]
                    article_title = article.findall("ArticleTitle")[0]

                    article_title = str(article_title.text)
                    #print article_title
                    article_title_list.append(article_title)

                except:
                    continue
            return article_title_list
        except Exception,e:
            print traceback.print_exc()

    # 3_获取年份
    def get_article_year(self, root):
        try:
            article_year_list = []
            cnt = 0
            for pubmed_article in root:
                try:
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    article = medline_citation.findall("Article")[0]
                    article_journal = article.findall("Journal")[0]
                    article_journal_issue = article_journal.findall("JournalIssue")[0]
                    pub_date = article_journal_issue.findall("PubDate")[0]
                    year = pub_date.findall("Year")[0]

                    year = str(year.text)
                    article_year_list.append(year)

                except:
                    continue
            return article_year_list
        except Exception, e:
            print traceback.print_exc()

    # 4_获取出版社名称
    def get_article_journal_title(self, root):
        try:
            journal_title_list = []
            for pubmed_article in root:
                try:
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    article = medline_citation.findall("Article")[0]
                    article_journal = article.findall("Journal")[0]
                    article_journal_title = article_journal.findall("Title")[0]
                    journal_title = str(article_journal_title.text)

                    journal_title_list.append(journal_title)

                except:
                    continue
            return journal_title_list
        except Exception, e:
            print traceback.print_exc()

    # 5_pubmed获取文章摘要
    def get_article_abstract(self, root):
        try:
            article_abstract_list = []
            cnt = 0
            for pubmed_article in root:
                try:
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    article = medline_citation.findall("Article")[0]
                    article_abstract = article.findall("Abstract")[0]
                    article_abstract_text = article_abstract.findall("AbstractText")[0]

                    # 考虑有些文章不存在摘要的情形
                    if article_abstract_text is not None :
                        cnt += 1
                        abstract = str(article_abstract_text.text)
                        #print cnt, " ", abstract

                        article_abstract_list.append(abstract)

                except:
                    continue
            return article_abstract_list
        except Exception, e:
            print traceback.print_exc()

    # 5-1_pmc_获取文章摘要
    def get_article_abstract1(self):
        try:

            res_list = []
            for article in self.root:
                try:
                    author_list = []
                    # print pubmed_article
                    # print "---------------------------------------------------"
                    front = article.findall("front")[0]
                    article_meta = front.findall("article-meta")[0]
                    abstract = article_meta.findall("abstract")[0]

                    abstract_p = abstract.findall("p")[0]
                    res_list.append(abstract_p.text)

                except:
                    continue
            return res_list
        except Exception, e:
            print traceback.print_exc()

    # 6_获取出版社名称 - (名字,位置)
    def get_article_journal_info(self, root):
        try:

            # journal_country_list = []
            # journal_name_list = []

            journal_info_list = []
            for pubmed_article in root:
                try:
                    medline_citation = pubmed_article.findall("MedlineCitation")[0]
                    journal_info = medline_citation.findall("MedlineJournalInfo")[0]

                    journal_country = str(journal_info.findall("Country")[0].text)
                    journal_name = str(journal_info.findall("MedlineTA")[0].text)

                    journal_info_list.append(journal_name + ',' + journal_country)

                except:
                    continue
            return journal_info_list

        except Exception, e:

            print traceback.print_exc()

#---------------------------------------------------------#
#                     计算统计特征                          -#
#----------------------------------------------------------#

    # 7_计算每年所发文章数
    def cal_num_of_article_in_each_year(self, write_path):

        try:
            year_list = self.get_article_year(self.root)

            counter = dict()

            #total = len(year_list)
            #print "TOTAL articles: ", total
            for y in year_list:
                if y in counter :
                    counter[y] += 1
                else:
                    counter[y] = 1

            pairs = list(counter.items())
            pairs.sort(reverse=True)


            outfile = open(write_path, "w")
            for pair in pairs:

                line = str(pair[0]) + "\t" + str(pair[1])
                outfile.write(line +'\n')

            outfile.close()

        except Exception, e:
            print traceback.print_exc()

    # 8_pubmed计算文章标题中词频
    def cal_word_occurence_in_article_title(self,output_path):
        try:
            article_list = self.get_article_title(self.root)

            stop_words_list = self.get_stop_words(self.stop_words_path)
            stop_words_list.append(' ')
            stop_words_list.append('')  # 这个要占很大的地方

            word_counter = dict()

            for article in article_list:

                try:
                    # 预处理
                    line = ""
                    for ch in article:
                        if ch.isalpha():
                            line += ch
                        else:
                            line += ' '

                    article = line
                    article = article.split(' ')

                    for word in article:
                        word = word.lower()
                        if word in stop_words_list:
                            continue

                        if word in word_counter:
                            word_counter[word] += 1
                        else:
                            word_counter[word] = 1

                except:
                    continue

            pairs = list(word_counter.items())
            items = [(count,word) for (word,count) in pairs]
            items.sort(reverse=True)

            write_path = output_path + "word_occurence_in_article_title.txt"
            outfile = open(write_path,"w")

            final_str = ""
            final_freq = ""
            cnt = 0

            for item in items:
                line =  str(item[1]) + "\t" + str(item[0])
                outfile.write(line +'\n')

                if cnt < 10:
                    if cnt == 0:
                        final_str = "'" + item[1] + "'" + final_str
                        final_freq = "'" + str(item[0]) + "'" + final_freq
                    else:
                        final_str = "'" + item[1] + "'" + ',' + final_str
                        final_freq = "'" + str(item[0]) + "'" + ',' + final_freq

                cnt += 1

            final_str = '[' + final_str + ']'
            final_freq = '[' + final_freq + ']'
            outfile.write(final_str + '\n')
            outfile.write(final_freq + '\n')

            outfile.close()

        except Exception, e:
            print traceback.print_exc()

    # 9_pubmed计算文章摘要中词频
    def cal_word_occurence_in_article_abstract(self, output_path):
        try:
            abstract_list = self.get_article_abstract(self.root)

            stop_words_list = self.get_stop_words(self.stop_words_path)
            stop_words_list.append(' ')
            stop_words_list.append('')  # 这个要占很大的地方

            word_counter = dict()

            for abstract in abstract_list:

                try:

                    # 预处理
                    line = ""
                    for ch in abstract:
                        if ch.isalpha():
                            line += ch
                        else:
                            line += ' '
                    abstract = line
                    abstract = abstract.split(' ')


                    for word in abstract:
                        word = word.lower()
                        if word in stop_words_list:
                            continue

                        if word in word_counter:
                            word_counter[word] += 1
                        else:
                            word_counter[word] = 1

                except:
                    continue

            pairs = list(word_counter.items())
            items = [(count, word) for (word, count) in pairs]
            items.sort(reverse=True)

            write_path = output_path + "word_occurence_in_article_abstract.txt"
            outfile = open(write_path, "w")

            final_str = ""
            final_freq = ""
            cnt = 0

            for item in items:
                line = str(item[1]) + "\t" + str(item[0])
                outfile.write(line + '\n')

                if cnt < 10:
                    if cnt == 0:
                        final_str = "'" + item[1] + "'" + final_str
                        final_freq = "'" + str(item[0]) + "'"+ final_freq
                    else:
                        final_str = "'"+item[1]+"'" + ',' + final_str
                        final_freq = "'" + str(item[0]) + "'" + ',' + final_freq

                cnt += 1

            final_str = '[' + final_str + ']'
            final_freq = '[' + final_freq + ']'
            outfile.write(final_str + '\n')
            outfile.write(final_freq + '\n')

            outfile.close()

        except Exception, e:
            print traceback.print_exc()

    # 9_1_pmc计算文章摘要中词频
    def cal_word_occurence_in_article_abstract1(self, write_path):
        try:
            abstract_list = self.get_article_abstract1()

            stop_words_list = self.get_stop_words(self.stop_words_path)
            stop_words_list.append(' ')
            stop_words_list.append('') # 这个要占很大的地方

            word_counter = dict()

            for abstract in abstract_list:

                try:
                    # 预处理
                    line = ""
                    for ch in abstract:
                        if ch.isalpha():
                            line += ch
                        else:
                            line += ' '
                    abstract = line
                    abstract = abstract.split(' ')

                    for word in abstract:
                        word = word.lower()
                        if word in stop_words_list:
                            continue


                        if word in word_counter:
                            word_counter[word] += 1
                        else:
                            word_counter[word] = 1
                except:
                    continue

            pairs = list(word_counter.items())
            items = [(count, word) for (word, count) in pairs]
            items.sort(reverse=True)

            #for item in items:
            #    print item[0], '\t', item[1]


            outfile = open(write_path, "w")
            for item in items:
                try:
                    line = ""
                    line = str(item[1]) + '\t' + str(item[0])
                    outfile.write(line+'\n')
                except Exception as ex:
                    print ex
            outfile.close()

        except Exception, e:
            print traceback.print_exc()

    # 10_计算期刊的名字以及其地理位置的出现次数
    def cal_journal_name_and_country_ouucrence(self, country_path, name_path):
        try:

            name_counter = dict()
            country_counter = dict()

            journal_info_list = self.get_article_journal_info(self.root)
            for item in journal_info_list:

                item = item.split(',')
                journal_name = item[0]
                journal_country = item[1]

                if journal_name in name_counter:
                    name_counter[journal_name] += 1
                else:
                    name_counter[journal_name] = 1

                if journal_country in country_counter:
                    country_counter[journal_country] += 1
                else:
                    country_counter[journal_country] = 1

            pairs = list(name_counter.items())
            reverse_pairs = [ (count,name) for (name,count) in pairs ]
            reverse_pairs.sort(reverse=True)

            outfile = open(name_path, "w")
            for item in reverse_pairs:

                name = str(item[1])
                count = str(item[0])

                line = ""
                line += name
                line += '\t'
                line += count

                outfile.write(line + '\n')

            outfile.close()

            pairs = list(country_counter.items())
            reverse_pairs = [(count, country) for (country, count) in pairs]
            reverse_pairs.sort(reverse=True)

            outfile = open(country_path, "w")
            for item in reverse_pairs:
                name = str(item[1])
                count = str(item[0])

                line = ""
                line += name
                line += '\t'
                line += count

                outfile.write(line + '\n')

            outfile.close()


        except Exception, e:
            print traceback.print_exc()

    # 11_计算发布量前10的论文,在不同区的数量
    def cal_num_in_diff_area(self, input_path, out_path):
        try:

            area_counter = {}

            cnt = 0
            infile = open(input_path, "r")
            for line in infile:
                cnt += 1
                if cnt == 1:
                    continue

                line = line.rstrip('\n').split(' ')

                num = int(line[1])
                area = line[3]

                if area in area_counter:
                    area_counter[area] += num
                else:
                    area_counter[area] = num
            infile.close()

            outfile = open(out_path, "w")
            for area in area_counter:
                line = ""
                line += str(area)
                line += " "
                line += str(area_counter[area])
                outfile.write(line + '\n')
            outfile.close()

        except Exception, e:
            print traceback.print_exc()

    # 12_计算影响因子
    def cal_aver_if_factor(self, input_path):
        try:

            cnt = 0
            infile = open(input_path, "r")

            total_num = 0
            total_factor = 0.0

            for line in infile:
                cnt += 1
                if cnt == 1:
                    continue

                line = line.rstrip('\n').split(' ')
                num = int(line[1])
                factor = float(line[2])

                total_num += num
                total_factor += factor * num


            infile.close()

            print total_factor / total_num

        except Exception, e:
            print traceback.print_exc()

    # 13_获取停用词
    def get_stop_words(self, stop_words_path):
        result_list = []

        infile = open(stop_words_path, "r")
        for line in infile:
            line = line.rstrip('\n')
            result_list.append(line)
        infile.close()

        return result_list

    # 14_测试函数
    def test(self):
        journal_info_list = self.get_article_journal_info(self.root)
        print len(journal_info_list)
        for aa in journal_info_list:
            print aa
- main.py


#-*- coding:utf-8 -*-
from XmlParser import*
from MyGraph import*

# Stop-word file; XmlParser.get_stop_words reads it one word per line.
STOP_WORDS_PATH = "../file/stop_words.txt"

# Sample XML inputs and per-dataset output directories. They are only
# referenced by the commented-out calls in main().
XML_PATH1 = "../data/PUBMED/LANCET/2006/lancet_2006_1570.xml"
#XML_PATH2 = "../data/PUBMED/LANCET/2009/lancet_2009_1516.xml"
#OUTPUT_PATH1 = "../output/network_analysis/PUBMED/LANCET/2006/"
#OUTPUT_PATH2 = "../output/network_analysis/PUBMED/LANCET/2009/"
# NOTE(review): not referenced anywhere in the visible code -- confirm it is still needed.
OUTPUT_PATH3 = "../output/src_output/edge.txt"

# Edge-list file read by test_for_srx / test_for_jcx, and the output directory
# handed to MyGraph.set_output_path by those functions.
INPUT_PATH = "../data/src_input/citation.csv"
OUTPUT_PATH = "../output/src_output/"

# @xml_parser_obj:xml解析后的对象
# @OUTPUT_PATH:统计分析之后的输出路径
def statical_analysis( xml_parser_obj, OUTPUT_PATH ):
    """Run the word-frequency reports for article abstracts and titles.

    Args:
        xml_parser_obj: parser object providing
            ``cal_word_occurence_in_article_abstract`` and
            ``cal_word_occurence_in_article_title``.
        OUTPUT_PATH: output directory prefix passed to both methods.

    Exceptions are printed with ``traceback.print_exc()`` and swallowed.
    """
    try:
        xml_parser_obj.cal_word_occurence_in_article_abstract(OUTPUT_PATH)
        xml_parser_obj.cal_word_occurence_in_article_title(OUTPUT_PATH)

        print("[INFO]: statical_analysis is finished!")
    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()

# @xml_parser_obj:xml解析后的对象
# @OUTPUT_PATH: 网络静态分析之后的输出路径
def author_collaboration_network_analysis( xml_parser_obj, OUTPUT_PATH ):
    """Build the author collaboration graph of one dataset and analyse it.

    The author lists returned by ``xml_parser_obj.get_article_author()`` are
    handed to ``MyGraph.construct_graph``; the statistics, the Pajek export
    and the drawings are then produced through the ``MyGraph`` methods called
    below.

    Args:
        xml_parser_obj: parser object providing ``get_article_author``.
        OUTPUT_PATH: output directory passed to ``MyGraph.set_output_path``.

    Exceptions are printed with ``traceback.print_exc()`` and swallowed.
    """
    try:
        # get the author clique list
        author_clique_list = xml_parser_obj.get_article_author()

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()

        graph.cal_degree_distribution()
        graph.cal_density()

        # the collaboration network is usually not connected
        #graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()

        graph.write_to_pajek_net1()

        # This does not really draw communities: it only draws the different
        # cliques, on the whole graph.
        graph.draw_community()

        graph.set_max_connected_component_subgraph()
        graph.draw_max_connected_component_subgraph()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()

        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()

        print("[INFO]: author_collaboration_network_analysis is finished!")
    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()

def author_collaboration_network_analysis1( xml_parser_obj1, xml_parser_obj2, OUTPUT_PATH ):
    """Build one author collaboration graph from two datasets and analyse it.

    The author lists of both parsers are concatenated and handed to
    ``MyGraph.construct_graph``; statistics, the Pajek export and the drawing
    are produced through the ``MyGraph`` methods called below.

    Args:
        xml_parser_obj1: first parser providing ``get_article_author``.
        xml_parser_obj2: second parser providing ``get_article_author``.
        OUTPUT_PATH: output directory passed to ``MyGraph.set_output_path``.

    Exceptions are printed with ``traceback.print_exc()`` and swallowed.
    """
    try:
        # get the author clique list of both datasets
        author_clique_list = xml_parser_obj1.get_article_author()
        author_clique_list.extend(xml_parser_obj2.get_article_author())

        # construct the graph based on the author clique list
        graph = MyGraph()
        graph.construct_graph(author_clique_list)
        graph.set_output_path(OUTPUT_PATH)

        # calculate the statistics
        graph.cal_num_of_nodes()
        graph.cal_num_of_edges()

        graph.cal_degree_distribution()
        graph.cal_density()

        # NOTE(review): author_collaboration_network_analysis disables this
        # call because collaboration networks are usually not connected --
        # confirm that running it here is intended.
        graph.cal_average_shortest_path_length()
        graph.cal_average_clustering()

        graph.write_to_pajek_net1()

        graph.draw_community()
        #graph.draw_graph()
        #graph.draw_graph_spring_layout()
        #graph.draw_graph_random()

        # The original message named the sibling function (copy-paste slip).
        print("[INFO]: author_collaboration_network_analysis1 is finished!")
    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()

def test_for_srx():
    """Load the comma-separated edge list at INPUT_PATH and compute statistics.

    The first two fields of every line are used as the edge endpoints. Lines
    with fewer than two fields (e.g. blank lines) are skipped. Results are
    produced by the ``MyGraph`` methods called below, with OUTPUT_PATH set as
    the graph's output path.

    Exceptions are printed with ``traceback.print_exc()`` and swallowed.
    """
    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)

        # open() + "with" replaces the Python-2-only file() builtin and
        # guarantees that the handle is closed.
        with open(INPUT_PATH, "r") as infile:
            for line in infile:
                # Strip the line terminator first: otherwise the second field
                # of a two-column row is "v\n", a different node from "v".
                fields = line.rstrip('\r\n').split(',')
                if len(fields) < 2:
                    continue
                graph.add_edge(fields[0], fields[1])

        print("[INFO]: graph is finished!")

        graph.cal_average_clustering()
        graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        graph.cal_degree_distribution()
        graph.cal_density()
        graph.cal_transitivity()

    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()

def test_for_jcx(max_edges=10000):
    """Load a whitespace-separated edge list from INPUT_PATH and draw it.

    The first two whitespace-separated tokens of every line are used as the
    edge endpoints. At most ``max_edges`` lines are read.

    Args:
        max_edges: maximum number of lines (edges) to load; ``None`` reads the
            whole file. Defaults to the previously hard-coded 10000.

    Exceptions are printed with ``traceback.print_exc()`` and swallowed.
    """
    import itertools

    try:
        graph = MyGraph()
        graph.set_output_path(OUTPUT_PATH)

        # open() + "with" replaces the Python-2-only file() builtin and
        # guarantees that the handle is closed.
        with open(INPUT_PATH, "r") as infile:
            for line in itertools.islice(infile, max_edges):
                tokens = line.split()
                graph.add_edge(tokens[0], tokens[1])
        print("[INFO]: graph is finished!")

        # Statistics currently disabled:
        # graph.cal_average_clustering()
        # graph.cal_average_shortest_path_length_in_max_connected_component_subgraph()
        # graph.cal_degree_distribution()
        # graph.cal_density()
        # graph.cal_transitivity()

        title = "Social Network - Live Journal"
        graph.draw_graph(title)

    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()

def main():
    """Script entry point: currently runs ``test_for_srx()`` only.

    The XML-based analyses are kept below as commented-out calls. Exceptions
    are printed with ``traceback.print_exc()`` and swallowed.
    """
    try:

        print("[INFO]: Programme is running......")

        # parse the xml and get the result
        #a_obj1 = XmlParser(XML_PATH1, STOP_WORDS_PATH)
        #a_obj2 = XmlParser(XML_PATH2, STOP_WORDS_PATH)

        #statical_analysis(a_obj1, OUTPUT_PATH1)
        #statical_analysis(a_obj2, OUTPUT_PATH2)

        #author_collaboration_network_analysis(a_obj1, OUTPUT_PATH1)

        test_for_srx()

        print("[INFO]: Programme terminated successfully!")

    except Exception:
        # print_exc() writes the traceback itself and returns None;
        # wrapping it in print only appended a stray "None" line.
        traceback.print_exc()


# Run the analysis only when executed as a script, so that importing this
# module has no side effects.
if __name__ == "__main__":
    main()

From:http://blog.csdn.net/kang_tju/article/details/54589306

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值