目录
2.textrank源码中UndirectWeightedGraph类方法分解解析
(2)添加边的函数def addEdge(self, start, end, weight)
(3)def rank(self)函数(个人觉得在这个无向有权图类中最重要的一部分)
3.textrank源码中TextRank(KeywordExtractor)类的代码分片解释
(2) def pairfilter(self, wp)函数
1.textrank源码解析
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
from operator import itemgetter
from collections import defaultdict
import jieba.posseg
from .tfidf import KeywordExtractor
from .._compat import *
class UndirectWeightedGraph:
d = 0.85
def __init__(self):
self.graph = defaultdict(list)#这是进行分词后的一个词典
def addEdge(self, start, end, weight):
# use a tuple (start, end, weight) instead of a Edge object
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))
def rank(self):
ws = defaultdict(float)#权值list表
outSum = defaultdict(float)
# 初始化各个结点的权值
# 统计各个结点的出度的次数之和
wsdef = 1.0 / (len(self.graph) or 1.0)
for n, out in self.graph.items():
ws[n] = wsdef
outSum[n] = sum((e[2] for e in out), 0.0)#e[2]是什么?
# this line for build stable iteration
sorted_keys = sorted(self.graph.keys())
# 遍历若干次
for x in xrange(10): # 10 iters
#遍历各个节点
for n in sorted_keys:
s = 0
# 遍历结点的入度结点
for e in self.graph[n]:
# 将这些入度结点贡献后的权值相加
# 贡献率 = 入度结点与结点n的共现次数 / 入度结点的所有出度的次数
s += e[2] / outSum[e[1]] * ws[e[1]]
# 更新结点n的权值
ws[n] = (1 - self.d) + self.d * s
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
# 获取权值的最大值和最小值
for w in itervalues(ws):
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
# 对权值进行归一化
for n, w in ws.items():
# to unify the weights, don't *100.
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
return ws
class TextRank(KeywordExtractor):
def __init__(self):
#初始化时,默认加载分词函数tokenizer = jieba.dt以及词性标注工具jieba.posseg.dt,停用词stop_words = self.STOP_WORDS.copy(),
#词性过滤集合pos_filt = frozenset(('ns', 'n', 'vn', 'v')),窗口span = 5,(("ns", "n", "vn", "v"))表示词性为地名、名词、动名词、动词。
self.tokenizer = self.postokenizer = jieba.posseg.dt
self.stop_words = self.STOP_WORDS.copy()
self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
self.span = 5
def pairfilter(self, wp):
return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
and wp.word.lower() not in self.stop_words)
def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
"""
Extract keywords from sentence using TextRank algorithm.
Parameter:
- topK: return how many top keywords. `None` for all possible words.
- withWeight: if True, return a list of (word, weight);
if False, return a list of words.
- allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
if the POS of w is not in this list, it will be filtered.
- withFlag: if True, return a list of pair(word, weight) like posseg.cut
if False, return a list of words
"""
self.pos_filt = frozenset(allowPOS)
#定义无向有权图
g = UndirectWeightedGraph()
#定义共现词典
cm = defaultdict(int)
#分词
words = tuple(self.tokenizer.cut(sentence))
#一次遍历每个词
for i, wp in enumerate(words):
#词i满足过滤条件
if self.pairfilter(wp):
# 依次遍历词i 之后窗口范围内的词
for j in xrange(i + 1, i + self.span):
# 词j 不能超出整个句子