一.基于TextRank的关键词提取步骤
基于TextRank的关键词提取是一种基于图的排序算法,以下是基于TextRank的关键词提取的步骤:
1.文本预处理:
清洗文本数据,去除无关字符、标点符号、停用词等。
进行分词,将文本划分为单独的词语。
2.构建图(图的节点和边):
将文本中的词语作为图的节点。根据词语之间的共现关系构建边。通常可以选择在一个窗口内共同出现的词语之间建立边。
这一步的具体操作包括以下几个阶段:
2.1 分词: 将文本进行分词,将文本拆分为一个个独立的词语。
2.2 窗口建立共现关系: 设定一个窗口大小,遍历文本中的词语。
在每个窗口内,窗口内的词语被认为是共现的。
窗口的大小可以根据具体任务和文本特点来调整。
2.3 建立图的节点和边:将文本中的每个词语作为图的节点。
对于在窗口内共现的词语,为它们之间建立一条边。
这样,如果两个词在同一个窗口内同时出现,它们之间就有一条边。
例如,考虑以下文本: "TextRank is an algorithm for keyword extraction." 分词后得到词语序列 ["TextRank", "is", "an", "algorithm", "for", "keyword", "extraction"]。
如果设置窗口大小为3(即每个词与其后两个词视为共现),就会得到以下边的关系:
("TextRank", "is"),
("TextRank", "an"),
("is", "an"),
("is", "algorithm"),
("an", "algorithm"),
("an", "for"),
("algorithm", "for"),
("algorithm", "keyword"),
("for", "keyword"),
("for", "extraction"),
("keyword", "extraction")
在这里,每个词语对应一个节点,窗口内共现的词语之间构建边。
这样,通过窗口内的共现关系,就建立了图的节点和边,为后续的TextRank算法提供了基础。
在TextRank的迭代过程中,节点之间的权重将根据共现关系进行调整,最终得到关键词的排序。
2.4 计算节点之间的权重:为图中的每个节点计算权重。
通常使用词语之间的共现次数或者其他相关度作为权重。
2.5 利用TextRank算法进行迭代:
初始化节点的权重。
迭代更新节点的权重,直到收敛。TextRank的更新公式类似于PageRank算法,涉及到节点之间的传播和权重的调整。
2.6 根据节点的权重排序:
根据节点的权重值进行排序,选择排名靠前的词作为关键词。
2.7 提取关键词:
根据排序后的节点,选择排名靠前的词作为关键词。
可以根据需求选择提取的关键词数量。
整个流程涉及到图的构建和基于图的算法进行迭代,TextRank考虑了词语之间的关系,具有一定的上下文信息,因此在关键词提取中较为有效。这种基于图的关键词提取方法不仅可以应用于单一文档,还可以用于多篇文档的关键词提取,提高了对文本的整体理解。
二.基于TextRank的关键词提取的代码实现
textrank基本代码实现:
概念可以参考这篇讲解很详细的文章:https://zhuanlan.zhihu.com/p/359232044
import networkx as nx
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk import pos_tag
# Sample English text used by the demo at the bottom of this script
text = "TextRank is an algorithm for keyword extraction. It is based on graph theory and used for ranking words in a text."
# Tokenization and POS tagging
def preprocess_text(text):
    """Tokenize *text* and return a list of (word, pos_tag) tuples.

    Steps: NLTK word tokenization, lowercase, drop non-alphabetic tokens,
    drop English stopwords, then POS-tag the remaining words.
    """
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha()]  # drop punctuation/numbers
    # Build the stopword set once: the original evaluated
    # stopwords.words("english") (a fresh list) for every single token.
    stop_set = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_set]
    words_pos = pos_tag(words)
    return words_pos
# Build the word graph over (word, pos) nodes
def build_graph(words_pos):
    """Build an undirected networkx graph whose nodes are (word, pos) tuples.

    NOTE(review): despite the name, this links *every* pair of words sharing
    the same POS tag, not only words co-occurring inside a window — confirm
    this is intended before reusing.
    """
    graph = nx.Graph()
    graph.add_nodes_from(set(words_pos))
    for idx, first in enumerate(words_pos[:-1]):
        for second in words_pos[idx + 1:]:
            if first[1] != second[1]:  # only link words with matching POS
                continue
            if graph.has_edge(first, second):
                graph[first][second]['weight'] += 1
            else:
                graph.add_edge(first, second, weight=1)
    return graph
# TextRank algorithm (weighted PageRank iteration)
def textrank(G, max_iter=100, tol=1e-4, damping_factor=0.85):
    """Rank the nodes of weighted graph *G* with a PageRank-style iteration.

    Parameters:
    - G: undirected graph exposing .nodes, .neighbors(n) and G[u][v]['weight'].
    - max_iter: maximum number of update passes.
    - tol: L1 convergence threshold on the score vector.
    - damping_factor: PageRank damping factor d.

    Returns a dict mapping node -> score. An empty graph yields {}
    (the original raised ZeroDivisionError on 1 / n).
    """
    nodes = list(G.nodes)
    n = len(nodes)
    if n == 0:
        return {}
    # Hoist each node's total outgoing edge weight out of the loop:
    # the original recomputed these sums for every edge on every pass.
    out_weight = {
        node: sum(G[node][nb]['weight'] for nb in G.neighbors(node))
        for node in nodes
    }
    base = (1 - damping_factor) / n
    p = {node: 1 / n for node in nodes}
    for _ in range(max_iter):
        new_p = {
            node: base + damping_factor * sum(
                G[nb][node]['weight'] * p[nb] / out_weight[nb]
                for nb in G.neighbors(node)
            )
            for node in nodes
        }
        converged = sum(abs(new_p[node] - p[node]) for node in nodes) < tol
        # Keep the freshest scores; the original returned the stale p
        # when it broke out on convergence.
        p = new_p
        if converged:
            break
    return p
# End-to-end keyword extraction pipeline
def get_keywords(text, top_n=5):
    """Return the *top_n* highest-ranked words of *text* as a list of strings."""
    graph = build_graph(preprocess_text(text))
    ranking = sorted(textrank(graph).items(), key=lambda item: item[1], reverse=True)
    # each node is a (word, pos) tuple; keep only the word itself
    return [node[0] for node, _score in ranking[:top_n]]
# Demo: extract and print the top keywords of the sample text
keywords = get_keywords(text)
print("Keywords:", keywords)
官方给出的代码:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import absolute_import, unicode_literals
import sys
from operator import itemgetter
from collections import defaultdict
import jieba.posseg
from .tfidf import KeywordExtractor
from .._compat import *
class UndirectWeightedGraph:
    """Undirected weighted graph used by jieba's TextRank keyword extractor."""

    d = 0.85  # damping factor, as in PageRank

    def __init__(self):
        # adjacency list: node -> list of (start, end, weight) tuples
        self.graph = defaultdict(list)

    def addEdge(self, start, end, weight):
        # use a tuple (start, end, weight) instead of a Edge object
        # undirected graph: record the edge in both directions
        self.graph[start].append((start, end, weight))
        self.graph[end].append((end, start, weight))

    def rank(self):
        """Run 10 TextRank passes and return min-max-normalized node scores."""
        ws = defaultdict(float)       # node -> current score
        outSum = defaultdict(float)   # node -> total outgoing edge weight
        # uniform initial score; `or 1.0` guards against an empty graph
        wsdef = 1.0 / (len(self.graph) or 1.0)
        for n, out in self.graph.items():
            ws[n] = wsdef
            outSum[n] = sum((e[2] for e in out), 0.0)
        # this line for build stable iteration
        sorted_keys = sorted(self.graph.keys())
        for x in xrange(10):  # 10 iters
            for n in sorted_keys:
                s = 0
                for e in self.graph[n]:
                    # each neighbor e[1] contributes its score weighted by
                    # this edge's share of the neighbor's total out-weight
                    s += e[2] / outSum[e[1]] * ws[e[1]]
                ws[n] = (1 - self.d) + self.d * s
        # sys.float_info[0] is float max, sys.float_info[3] is the smallest
        # positive normal float: min_rank starts high, max_rank starts low
        (min_rank, max_rank) = (sys.float_info[0], sys.float_info[3])
        for w in itervalues(ws):
            if w < min_rank:
                min_rank = w
            if w > max_rank:
                max_rank = w
        for n, w in ws.items():
            # to unify the weights, don't *100.
            ws[n] = (w - min_rank / 10.0) / (max_rank - min_rank / 10.0)
        return ws
class TextRank(KeywordExtractor):
    """jieba's TextRank-based keyword extractor."""

    def __init__(self):
        # shared jieba POS tokenizer instance
        self.tokenizer = self.postokenizer = jieba.posseg.dt
        self.stop_words = self.STOP_WORDS.copy()
        # default POS tags kept as graph nodes (overridden per call by allowPOS)
        self.pos_filt = frozenset(('ns', 'n', 'vn', 'v'))
        self.span = 5  # co-occurrence window: pair each word with the next span-1 words

    def pairfilter(self, wp):
        # keep (word, flag) pairs whose POS is allowed, word has >= 2 chars,
        # and word is not a stopword
        return (wp.flag in self.pos_filt and len(wp.word.strip()) >= 2
                and wp.word.lower() not in self.stop_words)

    def textrank(self, sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'), withFlag=False):
        """
        Extract keywords from sentence using TextRank algorithm.
        Parameter:
            - topK: return how many top keywords. `None` for all possible words.
            - withWeight: if True, return a list of (word, weight);
                          if False, return a list of words.
            - allowPOS: the allowed POS list eg. ['ns', 'n', 'vn', 'v'].
                        if the POS of w is not in this list, it will be filtered.
            - withFlag: if True, return a list of pair(word, weight) like posseg.cut
                        if False, return a list of words
        """
        self.pos_filt = frozenset(allowPOS)
        g = UndirectWeightedGraph()
        # cm counts co-occurrences: (word_i, word_j) -> count
        cm = defaultdict(int)
        words = tuple(self.tokenizer.cut(sentence))
        for i, wp in enumerate(words):
            if self.pairfilter(wp):
                # pair wp with each qualifying word in the lookahead window
                for j in xrange(i + 1, i + self.span):
                    if j >= len(words):
                        break
                    if not self.pairfilter(words[j]):
                        continue
                    if allowPOS and withFlag:
                        cm[(wp, words[j])] += 1
                    else:
                        cm[(wp.word, words[j].word)] += 1
        for terms, w in cm.items():
            g.addEdge(terms[0], terms[1], w)
        nodes_rank = g.rank()
        if withWeight:
            tags = sorted(nodes_rank.items(), key=itemgetter(1), reverse=True)
        else:
            tags = sorted(nodes_rank, key=nodes_rank.__getitem__, reverse=True)
        if topK:
            return tags[:topK]
        else:
            return tags

    extract_tags = textrank
下面是使用封装好的代码实现的:
# coding=utf-8
import pandas as pd
import jieba.analyse
# Process titles and abstracts, extract keywords per document
def words_textrank(data, topK):
    """Extract the topK TextRank keywords for every row of *data*.

    Parameters:
    - data: DataFrame with 'id', 'title' and 'abstract' columns.
    - topK: number of keywords to keep per document.

    Returns a DataFrame with columns ['id', 'title', 'key'] where 'key'
    holds the space-joined keywords.
    """
    idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    ids, titles, keys = [], [], []
    # Load the custom stopword list once: jieba stores it globally, so the
    # original re-read the file on every single row for no effect.
    jieba.analyse.set_stop_words("data/stopWord.txt")
    for index in range(len(idList)):
        # concatenate title and abstract into one text
        text = '%s。%s' % (titleList[index], abstractList[index])
        print("\"", titleList[index], "\"", " 10 Keywords - TextRank :")
        # TextRank keyword extraction with POS filtering
        keywords = jieba.analyse.textrank(text, topK=topK,
                allowPOS=('n', 'nz', 'v','vd', 'vn','l', 'a', 'd'))
        # join with spaces; dropped the no-op encode("utf-8").decode("utf-8")
        keys.append(" ".join(keywords))
        ids.append(idList[index])
        titles.append(titleList[index])
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
    return result
if __name__ == '__main__':
    # Read the corpus, extract 10 keywords per document, persist the result.
    source_path = 'data/text.csv'
    corpus = pd.read_csv(source_path)
    keyword_table = words_textrank(corpus, 10)
    keyword_table.to_csv("result/textrank.csv", index=False)