2021SC@SDUSC
简介
本文主要分析核心模块PositionRank.py的使用.
初始化
# Create the word graph
self.graph = nx.Graph()
""" The word graph. """
# Window size used when generating edges
self.window = window
build_graph方法
该方法用于使用window窗口为graph添加边,代码分析如下:
def build_graph(self, window, pos=None):
    """
    Build the word graph by linking words that co-occur inside a window.

    Every word of ``self.words`` whose ``pos_pattern`` is listed in ``pos``
    becomes a node of ``self.graph``, keyed by its stemmed form. Two kept
    words are connected when their positions differ and they are fewer than
    ``window`` apart in the sequence of kept words; every further
    co-occurrence of the same pair adds 1 to the edge weight.

    :param window: size of the co-occurrence window, counted in kept words
    :param pos: part-of-speech tags to keep (defaults to nouns and adjectives)
    :return: None, ``self.graph`` is updated in place
    """
    if pos is None:
        # NN: noun, singular or mass
        # NNS: plural noun
        # NNP: proper noun, singular
        # NNPS: proper noun, plural
        # JJ: adjective
        pos = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']

    # Words that pass the part-of-speech filter, in document order.
    seq = []
    for el in self.words:
        if el.pos_pattern in pos:
            seq.append((el.stemmed_form, el.position))
            self.graph.add_node(el.stemmed_form)

    # Build the edges.
    for i, (stem_i, position_i) in enumerate(seq):
        for j in range(i + 1, len(seq)):
            # Later words are only further away, so stop at the window edge
            # instead of scanning the rest of the document.
            if j - i >= window:
                break
            stem_j, position_j = seq[j]
            if position_i == position_j:
                continue
            if self.graph.has_edge(stem_i, stem_j):
                # Existing edge: one more co-occurrence.
                self.graph[stem_i][stem_j]['weight'] += 1
            else:
                # First co-occurrence of this pair.
                self.graph.add_edge(stem_i, stem_j, weight=1)
添加边
如果节点 u 和 v 不在图中,add_edge 会自动将它们加入图中。边属性可以用关键字参数指定,也可以直接通过访问边的属性字典来设置。
例如:
>>> G.add_edge(1, 2, weight=3)
>>> G.add_edge(1, 3, weight=7, capacity=15, length=342.7)
def add_edge(self, u_of_edge, v_of_edge, **attr):
    """
    Add an edge between ``u_of_edge`` and ``v_of_edge``.

    Endpoints that are not yet in the graph are added first. If the edge
    already exists, its attribute dictionary is updated with ``attr``
    rather than replaced.

    :param u_of_edge: first endpoint of the edge
    :param v_of_edge: second endpoint of the edge
    :param attr: edge attributes given as keyword arguments
    :return: None
    """
    u, v = u_of_edge, v_of_edge
    # Add the endpoints if they are missing.
    if u not in self._node:
        self._adj[u] = self.adjlist_inner_dict_factory()
        self._node[u] = self.node_attr_dict_factory()
    if v not in self._node:
        self._adj[v] = self.adjlist_inner_dict_factory()
        self._node[v] = self.node_attr_dict_factory()
    # Add the edge; both directions share one attribute dictionary.
    datadict = self._adj[u].get(v, self.edge_attr_dict_factory())
    datadict.update(attr)
    self._adj[u][v] = datadict
    self._adj[v][u] = datadict
然而,当数据集改为中文后(该任务后期的要求),使用add_edge方法需要添加大量的遍历代码;为了方便地从以元组为元素的列表中添加边,可以使用基于add_edge方法的add_edges_from方法。
def add_edges_from(self, ebunch_to_add, **attr):
    """
    Add every edge of an iterable to the graph.

    Each element is either a 2-tuple ``(u, v)`` or a 3-tuple ``(u, v, dd)``
    where ``dd`` is a dictionary of attributes for that edge. Missing
    endpoints are added. Attributes in ``dd`` take precedence over the
    keyword attributes in ``attr``, which apply to every edge.

    :param ebunch_to_add: iterable of 2-tuples or 3-tuples describing edges
    :param attr: attributes applied to every added edge
    :return: None
    :raises NetworkXError: if an element is neither a 2-tuple nor a 3-tuple
    """
    for e in ebunch_to_add:
        ne = len(e)
        if ne == 3:
            u, v, dd = e
        elif ne == 2:
            u, v = e
            dd = {}
        else:
            raise NetworkXError(f"Edge tuple {e} must be a 2-tuple or 3-tuple.")
        # Add the endpoints if they are missing.
        if u not in self._node:
            self._adj[u] = self.adjlist_inner_dict_factory()
            self._node[u] = self.node_attr_dict_factory()
        if v not in self._node:
            self._adj[v] = self.adjlist_inner_dict_factory()
            self._node[v] = self.node_attr_dict_factory()
        # Shared attributes first, then the per-edge ones so they win.
        datadict = self._adj[u].get(v, self.edge_attr_dict_factory())
        datadict.update(attr)
        datadict.update(dd)
        self._adj[u][v] = datadict
        self._adj[v][u] = datadict
添加边的几种方式
>>> G = nx.Graph()
>>> e = (1, 2)
>>> G.add_edge(1, 2) # 指明两个节点
>>> G.add_edge(*e) # 对元组进行解包
>>> G.add_edges_from([(1, 2)]) # 从一个可迭代对象中添加边
candidate_scoring方法
该方法用于为候选词、短语打分(基于PageRank算法)
一个小插曲:我一直以为PageRank算法是“页面排名”算法,这和它的功能十分契合;然而在信息检索课程中,老师向我们介绍,PageRank算法是由谷歌创始人之一拉里·佩奇(Larry Page)提出的,因此以他的姓氏“Page”命名为PageRank算法。
该部分代码分析如下:
def candidate_scoring(self, pos=None, window=10, update_scoring_method=False):
    """
    Score the candidate words and phrases with a position-biased PageRank.

    The word graph is built, the candidates are filtered, and PageRank is
    run with a personalization vector in which every kept word weighs the
    sum of ``1 / position`` over its occurrences (normalized to sum to 1).
    The resulting scores are stored in ``self.weights``, keyed by the
    candidate's stemmed form.

    :param pos: part-of-speech tags to keep (defaults to nouns and adjectives)
    :param window: size of the co-occurrence window used to build the graph
    :param update_scoring_method: if you want to update the scoring method based on my paper cited below:
    Florescu, Corina, and Cornelia Caragea. "A New Scheme for Scoring Phrases in Unsupervised Keyphrase Extraction."
    European Conference on Information Retrieval. Springer, Cham, 2017.
    :return: None, ``self.weights`` is updated in place; nothing is scored
        when no word passes the part-of-speech filter
    """
    from collections import Counter

    if pos is None:
        pos = ['NN', 'NNS', 'NNP', 'NNPS', 'JJ']

    # Build the word graph.
    self.build_graph(window=window, pos=pos)

    # Drop candidates that cannot be keyphrases.
    self.filter_candidates(max_phrase_length=4, min_word_length=3, valid_punctuation='-.')

    # Each kept word accumulates 1 / position over all of its occurrences.
    personalization = {}
    for w in self.words:
        # The word's own tag must not be stored in ``pos``: doing so replaces
        # the filter list and turns the test into ``tag in tag`` (always true).
        if w.pos_pattern in pos:
            stem = w.stemmed_form
            # assumes positions are 1-based (0 would divide by zero) - TODO confirm
            personalization[stem] = personalization.get(stem, 0.0) + 1.0 / w.position

    total = sum(personalization.values())
    if not total:
        # No word passed the filter: there is nothing to normalize or rank.
        return

    # Normalize so that the personalization vector sums to 1.
    factor = 1.0 / total
    normalized_personalization = {k: v * factor for k, v in personalization.items()}

    # ``pagerank_scipy`` no longer exists in networkx 3.x; use it when it is
    # available and fall back to ``pagerank`` otherwise.
    pagerank = getattr(nx, 'pagerank_scipy', None) or nx.pagerank
    pagerank_weights = pagerank(self.graph, personalization=normalized_personalization, weight='weight')

    if update_scoring_method:
        # Count every stemmed form once instead of rescanning all candidates
        # for each candidate.
        frequency = Counter(c.stemmed_form for c in self.candidates)
        for c in self.candidates:
            tokens = c.stemmed_form.split()
            if len(tokens) > 1:
                # Frequency times the harmonic mean of the word scores.
                self.weights[c.stemmed_form] = frequency[c.stemmed_form] * len(tokens) / sum(
                    1.0 / pagerank_weights[t] for t in tokens)
            else:
                self.weights[c.stemmed_form] = pagerank_weights[c.stemmed_form]
    else:
        # A phrase scores the sum of the scores of its words.
        for c in self.candidates:
            self.weights[c.stemmed_form] = sum(pagerank_weights[t] for t in c.stemmed_form.split())