序言
jieba.analyse.textrank()算法支持提取关键词,但是因为太通用了,所以对某些词频低但是重要性强的词语无法提取出来
如我的毕设对政府工作报告的文本分析中,词频最高的是“发展”和“建设”,但是这两个词太宽泛了,反而不如词频相对较低的“科技”、“民生”、“生态”等更能体现政府工作,所以对其进行了修改
主要做法是进入 jieba.analyse 这个模块,把它的核心类以及相关类、变量、import 都复制到本地即可。
TextRank类
按住ctrl点进jieba.analyse,找到class TextRank,复制进来
看到它需要 KeywordExtractor 类,按住ctrl点进去,把这个复制进来
里面又创建了UndirectWeightedGraph对象,点进去再复制进来
# TextRank class: core textrank extraction with topic-aware edge boosting.
class TextRank(KeywordExtractor):

    def __init__(self):
        # Reuse jieba's default POS tokenizer; span is the co-occurrence window size.
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.span = 5

    def textrank(self, sentence, dealwordsList, bestRank=18, betterRank=14, normalRank=5, topK=50, withWeight=False):
        """Rank the words of dealwordsList with a topic-boosted TextRank.

        Co-occurrence edges touching a word in topicMain / topicGroup are
        boosted by bestRank / betterRank / normalRank on top of the base
        co-occurrence count.  Returns the topK words, or (word, weight)
        pairs when withWeight is True.  (Remove the three boost parameters'
        values when tuning.)
        """
        graph = UndirectWeightedGraph()          # undirected weighted word graph
        edge_weight = defaultdict(int)           # (word_i, word_j) -> accumulated edge weight

        # Weight-boosting section: adapt the boosts to your own data.
        for i, word in enumerate(dealwordsList):
            if len(word.strip()) < 2:
                continue                         # skip blank / single-character tokens
            # All words inside the co-occurrence window following position i
            # (slicing clamps at the end of the list, like the original bounds check).
            for neighbor in dealwordsList[i + 1:i + self.span]:
                pair = (word, neighbor)
                if word in topicMain or neighbor in topicMain:
                    edge_weight[pair] += bestRank
                for group in topicGroup:
                    # NOTE(review): group[1:2] is a one-element slice while the
                    # next tier uses group[3:] — confirm these index ranges are intended.
                    if word in group[1:2] or neighbor in group[1:2]:
                        edge_weight[pair] += betterRank
                    if word in group[3:] or neighbor in group[3:]:
                        edge_weight[pair] += normalRank
                edge_weight[pair] += 1           # base co-occurrence count

        # Each dict entry becomes an edge (start, end) with weight w.
        for (start, end), w in edge_weight.items():
            graph.addEdge(start, end, w)
        nodes_rank = graph.rank()                # per-word TextRank weights

        # Sort descending, with or without the weights attached.
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        # Truncate to topK keywords when requested.
        return tags[:topK] if topK else tags

    extract_tags = textrank
KeywordExtractor类
# KeywordExtractor class: loads the shared stop-word list.
class KeywordExtractor(object):
    # Stop words are read once at class-definition time from stopwords.txt
    # (one word per line, UTF-8).  fromkeys() de-duplicates while preserving
    # order; STOP_WORDS keeps the list form for existing callers.
    # 'with' closes the file handle, which the original left open.
    with open('stopwords.txt', encoding='utf-8') as _f:
        STOP_WORDSDICT = {}.fromkeys(line.rstrip() for line in _f)
    del _f
    STOP_WORDS = list(STOP_WORDSDICT.keys())

    def set_stop_words(self, stop_words_path):
        """Replace this instance's stop words with those from stop_words_path.

        Raises Exception when the resolved path is not an existing file.
        """
        abs_path = _get_abs_path(stop_words_path)
        if not os.path.isfile(abs_path):
            raise Exception("jieba: file does not exist: " + abs_path)
        # BUGFIX: the original called self.stop_words.add(...) on an attribute
        # that was never created (and the class attr is a list, which has no
        # .add), so every call raised AttributeError.  Start from the class
        # list as a set, as upstream jieba does.
        self.stop_words = set(self.STOP_WORDS)
        with open(abs_path, 'rb') as f:
            content = f.read().decode('utf-8')
        for line in content.splitlines():
            self.stop_words.add(line)

    def extract_tags(self, *args, **kwargs):
        # Abstract: concrete extractors (e.g. TextRank) override this.
        raise NotImplementedError
UndirectWeightedGraph类
# UndirectWeightedGraph无向有权图对象,其内包含计算权重值的rank方法
class UndirectWeightedGraph:
d = 0.85
def __init__(self):
self.graph = defaultdict(list)
#将词i、词j作为一条边起始点和终止点,并录入权重值
def addEdge(self, start, end, weight):
self.graph[start].append((start, end, weight))
self.graph[end].append((end, start, weight))
def rank(self):
ws = defaultdict(float)#创建float型的默认字典
outSum = defaultdict(float)#创建float型的默认字典
wsdef = 1.0 / (len(self.graph) or 1.0)#每个节点权重的初始值
for n, out in self.graph.items():
ws[n] = wsdef #初始化各个节点的权重值
outSum[n] = sum((e[2] for e in out), 0.0)# 统计各个结点词语出现的次数之和
sorted_keys = sorted(self.graph.keys())#构建键的排序
for x in xrange(10): # 遍历十次
for n in sorted_keys:# 遍历各个节点的键
s = 0
for e in self.graph[n]: # 遍历节点的键对应的值
s += e[2] / outSum[e[1]] * ws[e[1]]# 将这些入度结点贡献后的权值相加,贡献率 = 入度结点与结点n的共现次数 / 入度结点的所有出度的次数
ws[n] = (1 - self.d) + self.d * s # 更新节点n的权值
(min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
# 获取权值的最大值和最小值
for w in itervalues(ws):
if w < min_rank:
min_rank = w
if w > max_rank:
max_rank = w
# 对权值进行归一化
for n, w in ws.items():
ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
return ws
一些参数和导包,放在最前面即可
# --- imports and Python-2/3 compatibility shims: place at the top of the file ---
import os
import sys
from collections import defaultdict
from operator import itemgetter

import jieba
import jieba.posseg


# IDIOM FIX (PEP 8 E731): named functions instead of lambdas assigned to names.
def _get_module_path(path):
    """Resolve *path* relative to this module's directory."""
    return os.path.normpath(
        os.path.join(os.getcwd(), os.path.dirname(__file__), path))


_get_abs_path = jieba._get_abs_path

# Python-3 aliases for the Python-2 names used by the copied jieba code.
text_type = str
string_types = (str,)
xrange = range


def iterkeys(d):
    return iter(d.keys())


def itervalues(d):
    return iter(d.values())


def iteritems(d):
    return iter(d.items())