2021SC@SDUSC
系列文章目录
(十二)PKE代码分析五
目录
前言
本篇博客继续对无监督模型的统计模型进行代码分析
unsupervised->statistical
yake.py
yake关键词提取模型
(一)原理
A Text Feature Based Automatic Keyword Extraction Method for Single Documents
YAKE是2018年提出的另一种流行的关键字提取算法。它在许多数据集上的表现优于TF-IDF和RAKE,并在ECIR 2018上获得了最佳"短篇论文奖"。
YAKE使用统计特征(statistical features)来识别并排序最重要的关键字。它不需要任何语言学信息,如NER或POS标记,因此可以用于任何语言,只需要提供该语言的停用词列表(stoplist)。
YAKE(Yet Another Keyword Extractor)是一种无监督的关键词提取算法,特征提取主要考虑五个因素(去除停用词后):
(二)使用示例
导入所需库
import pke
from nltk.corpus import stopwords
# 1. Create a YAKE keyphrase extractor.
extractor = pke.unsupervised.YAKE()
# 2. Load the content of the document.
extractor.load_document(input='path/to/input',
language='en',
normalization=None)
# 3. Select {1-3}-grams that do not contain punctuation marks and do not
#    begin/end with a stopword as keyphrase candidates.
stoplist = stopwords.words('english')
extractor.candidate_selection(n=3, stoplist=stoplist)
# 4. Weight the candidates using the YAKE weighting scheme; the window
#    (in words) used for computing left/right contexts can be specified.
window = 2
use_stems = False # use stems instead of words for weighting
extractor.candidate_weighting(window=window,
stoplist=stoplist,
use_stems=use_stems)
# 5. Get the 10 highest-scored candidates as keyphrases. Redundant
#    keyphrases are removed from the output using Levenshtein distance
#    and a threshold.
threshold = 0.8
keyphrases = extractor.get_n_best(n=10, threshold=threshold)
(三)函数
class YAKE(LoadFile):包含8个函数
1.def __init__(self):#重新定义YAKE的初始值设定项。
词汇表的容器
super(YAKE, self).__init__()
self.words = defaultdict(set)
词上下文的容器
self.contexts = defaultdict(lambda: ([], []))
词特征的容器
self.features = defaultdict(dict)
从表面形式到词汇形式的映射
self.surface_to_lexical = {}
2.def candidate_selection(self, n=3, stoplist=None, **kwargs):
选择1-3 grams作为关键词候选词。以停止字开头或结尾的候选词将被过滤掉。不允许使用不包含至少一个字母数字字符的单词。
参数:
n(int):n-gram长度,默认为3。
stoplist(list):筛选候选项的stoplist,默认为nltk stoplist。
#从1到3grams中选择ngrams
self.ngram_selection(n=n)
#筛选包含标点符号的候选项
self.candidate_filtering(stoplist=list(string.punctuation))
#如果未提供停用词表,则回退使用默认的 self.stoplist(注意:并非初始化为空列表)
if stoplist is None:
stoplist = self.stoplist
#进一步筛选候选对象,获取候选词,筛选以停止字开头/结尾或包含少于3个字符的第一个/最后一个字的候选词
for k in list(self.candidates):
v = self.candidates[k]
if v.surface_forms[0][0].lower() in stoplist or v.surface_forms[0][
-1].lower() in stoplist or len(
v.surface_forms[0][0]) < 3 or len(
v.surface_forms[0][-1]) < 3:
del self.candidates[k]
3.def _vocabulary_building(self, use_stems=False):
建立词汇表,用于衡量候选词的权重。仅保留至少包含一个字母数字字符的单词。
参数:
use_stems(bool):是否使用词干而不是小写单词进行加权,默认为False。
#循环句子
for i, sentence in enumerate(self.sentences):
#计算句子的偏移量
shift = sum([s.length for s in self.sentences[0:i]])
#在句子中循环单词
for j, word in enumerate(sentence.words):
#考虑包含至少一个字母数字字符的单词
if self._is_alphanum(word) and \
not re.search('(?i)^-[lr][rcs]b-$', word):
# 得到单词或词干
index = word.lower()
if use_stems:
index = sentence.stems[j]
# 添加单词引用
self.words[index].add((shift + j, shift, i, word))
4.def _contexts_building(self, use_stems=False, window=2):
构建单词的上下文以计算关联性特征。出现在n个单词窗口内的单词被视为上下文单词。只考虑块中同时出现的单词(词汇表中出现的单词序列)。
参数:
use_stems(bool):是否使用stems而不是小写单词进行加权,默认为False。
window(int):用于计算共现计数的窗口的字大小,默认为2。
# 循环句子
for i, sentence in enumerate(self.sentences):
# 将单词小写
words = [w.lower() for w in sentence.words]
# 如有必要,替换为词干(stems)
if use_stems:
words = sentence.stems
# block 容器
block = []
# 在句子中循环单词
for j, word in enumerate(words):
# 如果单词不在词汇表中,则跳过并刷新block
if word not in self.words:
block = []
continue
# 添加左侧上下文
self.contexts[word][0].extend(
[w for w in block[max(0, len(block) - window):len(block)]]
)
# 添加右侧上下文
for w in block[max(0, len(block) - window):len(block)]:
self.contexts[w][1].append(word)
# 将单词添加到当前block
block.append(word)
5.def _feature_extraction(self, stoplist=None):
#使用以下五个特征计算单个单词的权重:
这五个特征在本文的原理部分也有提及,即casing,position,frequency,relatedness,different,下面一一具体讲解并分析代码
#初始化停止列表(如果未提供)
if stoplist is None:
stoplist = self.stoplist
#获取每个单词的TF值
TF = [len(self.words[w]) for w in self.words]
#获取非停用词的词频
TF_nsw = [len(self.words[w]) for w in self.words if w not in stoplist]
#计算统计数据
mean_TF = numpy.mean(TF_nsw)
std_TF = numpy.std(TF_nsw)
max_TF = max(TF)
#循环读单词,以下所有操作均在此循环中进行
for word in self.words:
#指示该词是否为停用词(vitordouzi change)
self.features[word]['isstop'] = word in stoplist or len(word) < 3
#词频
self.features[word]['TF'] = len(self.words[word])
#大写/首字母缩写词 词频
self.features[word]['TF_A'] = 0
self.features[word]['TF_U'] = 0
for (offset, shift, sent_id, surface_form) in self.words[word]:
if surface_form.isupper() and len(word) > 1:
self.features[word]['TF_A'] += 1
elif surface_form[0].isupper() and offset != shift:
self.features[word]['TF_U'] += 1
# 1. CASING feature CASING:重视首字母缩写词或以大写字母开头的单词。 CASING(w) = max(TF(U(w)), TF(A(w))) / (1 + log(TF(w))) TF(U(w))是单词以大写字母开头的次数,句子开头除外。TF(A(w)) 是单词被标记为首字母缩写词的 次数。
self.features[word]['CASING'] = max(self.features[word]['TF_A'],
self.features[word]['TF_U'])
self.features[word]['CASING'] /= 1.0 + math.log(
self.features[word]['TF'])
# 2. POSITION feature
POSITION: 重视出现在文档开头的单词。 POSITION(w) = log( log( 3 + Median(Sen(w)) ) ) 其中 Sen(w) 包含 w 出现的句子的位置。
sentence_ids = list(set([t[2] for t in self.words[word]]))
self.features[word]['POSITION'] = math.log(
3.0 + numpy.median(sentence_ids))
self.features[word]['POSITION'] = math.log(
self.features[word]['POSITION'])
# 3. FREQUENCY feature
FREQUENCY: 重视频繁出现的词。 FREQUENCY(w) = TF(w) / ( MEAN_TF + STD_TF) 在有效的非停用词的字上计算MEAN_TF和STD_TF。
self.features[word]['FREQUENCY'] = self.features[word]['TF']
self.features[word]['FREQUENCY'] /= (mean_TF + std_TF)
# 4. RELATEDNESS feature
RELATEDNESS: 重视不具有停用词特征的词。论文公式为 RELATEDNESS(w) = 1 + (WR+WL)*(TF(w)/MAX_TF) + PL + PR,但注意在当前代码中 PL 和 PR 两项已被注释掉(见下方两行被注释的代码),实际实现为 RELATEDNESS(w) = 1 + (WR+WL)*(TF(w)/MAX_TF)。
self.features[word]['WL'] = 0.0
if len(self.contexts[word][0]):
self.features[word]['WL'] = len(set(self.contexts[word][0]))
self.features[word]['WL'] /= len(self.contexts[word][0])
self.features[word]['PL'] = len(set(self.contexts[word][0])) / max_TF
self.features[word]['WR'] = 0.0
if len(self.contexts[word][1]):
self.features[word]['WR'] = len(set(self.contexts[word][1]))
self.features[word]['WR'] /= len(self.contexts[word][1])
self.features[word]['PR'] = len(set(self.contexts[word][1])) / max_TF
self.features[word]['RELATEDNESS'] = 1
#self.features[word]['RELATEDNESS'] += self.features[word]['PL']
#self.features[word]['RELATEDNESS'] += self.features[word]['PR']
self.features[word]['RELATEDNESS'] += (self.features[word]['WR'] +
self.features[word]['WL']) * \
(self.features[word]['TF'] / max_TF)
# 5. DIFFERENT feature
DIFFERENT: 重视出现在多个句子中的词。 DIFFERENT(w) = SF(w) / # sentences 其中SF(w)是词w的句频。
self.features[word]['DIFFERENT'] = len(set(sentence_ids))
self.features[word]['DIFFERENT'] /= len(self.sentences)
计算完以上5个特征之后,最后一步
# 组合特征以加权单词
A = self.features[word]['CASING']
B = self.features[word]['POSITION']
C = self.features[word]['FREQUENCY']
D = self.features[word]['RELATEDNESS']
E = self.features[word]['DIFFERENT']
self.features[word]['weight'] = (D * B) / (A + (C / D) + (E / D))
6.def candidate_weighting(self, window=2, stoplist=None, use_stems=False):
YAKE论文中描述的候选权重计算
if not self.candidates:
return
self._vocabulary_building(use_stems=use_stems)
self._contexts_building(use_stems=use_stems, window=window)
self._feature_extraction(stoplist=stoplist)
for k, v in self.candidates.items():
if use_stems:
weights = [self.features[t]['weight'] for t in v.lexical_form]
self.weights[k] = numpy.prod(weights)
self.weights[k] /= len(v.offsets) * (1 + sum(weights))
else:
lowercase_forms = [' '.join(t).lower() for t in v.surface_forms]
for i, candidate in enumerate(lowercase_forms):
TF = lowercase_forms.count(candidate)
tokens = [t.lower() for t in v.surface_forms[i]]
prod_ = 1.
sum_ = 0.
for j, token in enumerate(tokens):
if self.features[token]['isstop']:
term_stop = token
prob_t1 = prob_t2 = 0
if j - 1 >= 0:
term_left = tokens[j-1]
prob_t1 = self.contexts[term_left][1].count(
term_stop) / self.features[term_left]['TF']
if j + 1 < len(tokens):
term_right = tokens[j+1]
prob_t2 = self.contexts[term_stop][0].count(
term_right) / self.features[term_right]['TF']
prob = prob_t1 * prob_t2
prod_ *= (1 + (1 - prob))
sum_ -= (1 - prob)
else:
prod_ *= self.features[token]['weight']
sum_ += self.features[token]['weight']
if sum_ == -1:
sum_ = -0.99999999999
self.weights[candidate] = prod_
self.weights[candidate] /= TF * (1 + sum_)
self.surface_to_lexical[candidate] = k
其中倒数第五行的 if 语句处理一种边界情况:当候选词完全由停用词构成时,累加的 sum_ 可能恰好等于 -1,此时将 sum_ 设为 -1+eps(即 -0.99999999999),保证 1+sum_ != 0,从而避免最后一步 prod_ / (TF * (1 + sum_)) 出现除零错误。
7.def is_redundant(self, candidate, prev, threshold=0.8):
测试一个候选词是否相对于已经选择的候选词列表是冗余的。 如果一个候选词与列表中排名更高的另一个候选词的编辑距离大于阈值,则该候选者被认为是冗余的。
# 遍历已经选择的候选词
for prev_candidate in prev:
dist = edit_distance(candidate, prev_candidate)
dist /= max(len(candidate), len(prev_candidate))
if (1.0 - dist) > threshold:
return True
return False
8.def get_n_best(self, n=10, redundancy_removal=True, stemming=False, threshold=0.8):
返回给定权重的 n 个最佳候选者
此方法的步骤为
# 按升序对候选词进行排序
# 删除多余的候选词
# 为非冗余候选词初始化一个新容器
# 遍历最佳候选词
# 测试候选词是否是多余的
# 否则添加候选词
# 如果找到 n-best 则中断计算
# 在最佳容器中复制非冗余候选词
# 以(词法形式,权重)元组的形式获取最佳候选词列表
# 如果没有词干则替换为表面形式
# 返回最佳候选词列表
总结
本文分析了unsupervised->statistical->yake.py的代码及原理,下篇博客将会分析其他模型