# 关键短语抽取 (key phrase extraction)
# -*- coding: utf-8 -*-
import os
import jieba
import jieba.analyse
import re
import numpy as np
from wordcloud import WordCloud
from PIL import Image
CURRENT_PATH = os.path.abspath(__file__)
DATA_PATH = os.path.dirname(os.path.dirname(CURRENT_PATH))
FONT_PATH = os.path.join(DATA_PATH, "data", "simfang.ttf")
MASK_PATH = os.path.join(DATA_PATH, "data", "map.png")
class KeyPhraseExtraction():
    """Extract key phrases by concatenating runs of adjacent keywords.

    Pipeline: extract weighted keywords (tfidf/textrank) -> split the text
    into sentences and tokenize -> join adjacent keywords into candidate
    phrases -> filter near-duplicate phrases -> score each phrase with the
    mean weight of its (at most 3 trailing) keywords.
    """

    def __init__(self, topk=50, method='tfidf', with_word=True):
        """
        :param topk: number of top keywords used to build phrases
        :param method: keyword extraction algorithm, 'tfidf' or 'textrank'
        :param with_word: whether standalone keywords are also included in
                          the final output alongside phrases
        """
        self.topk = topk
        self.method = method
        self.with_word = with_word

    def cut_sentences(self, text):
        """Split text into sentences, then tokenize each with jieba.

        NOTE(review): the regex keeps only spans ending in 。?! — any
        trailing text without terminal punctuation is dropped.
        """
        sentences = re.findall(".*?[。?!]", text)
        cut_sentences = [jieba.lcut(sent) for sent in sentences]
        return cut_sentences

    def key_words_extraction(self, text):
        """Return {keyword: weight} using the configured method."""
        keywords_score = []
        if self.method == 'tfidf':
            keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
        elif self.method == 'textrank':
            keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
        return {word: score for word, score in keywords_score}

    def key_phrase_extraction(self, text):
        """Return {phrase: weight} for the topk highest-scoring phrases."""
        keyword_score = self.key_words_extraction(text)
        keywords = keyword_score.keys()
        cut_sentences = self.cut_sentences(text)
        # Concatenate adjacent keywords into candidate phrases (>= 2 words).
        key_phrase = []
        for sent in cut_sentences:
            temp = []
            for word in sent:
                if word in keywords:
                    temp.append(word)
                else:
                    if len(temp) > 1:
                        if temp not in key_phrase:
                            key_phrase.append(temp)
                    temp = []
            # Bug fix: flush a keyword run that reaches the end of the
            # sentence — the original only flushed when a non-keyword token
            # followed, silently dropping sentence-final phrases.
            if len(temp) > 1 and temp not in key_phrase:
                key_phrase.append(temp)
        # Phrases may carry redundant information; keep a phrase only if it
        # shares fewer than half of the smaller phrase's words with every
        # phrase already kept.
        key_phrase_filter = []
        for phrase in key_phrase:
            flag = False
            for item in key_phrase_filter:
                if len(set(phrase) & set(item)) >= min(len(set(phrase)), len(set(item))) / 2.0:
                    flag = True
                    break
            if not flag:
                key_phrase_filter.append(phrase)
        # Weight each phrase: cap at the last 3 keywords and average their
        # individual keyword weights.
        keyphrase_weight = {''.join(phrase[-3:]): np.mean([keyword_score[word] for word in phrase[-3:]])
                            for phrase in key_phrase_filter}
        if self.with_word:
            # Add standalone keywords that are not already covered by a phrase.
            key_phrase_str = '|'.join(keyphrase_weight)
            for word, weight in keyword_score.items():
                if word not in key_phrase_str:
                    keyphrase_weight[word] = weight
        keyphrase_weight = dict(sorted(keyphrase_weight.items(), key=lambda x: x[1], reverse=True)[:self.topk])
        return keyphrase_weight

    def wordcloud(self, keyphrrase_weight, save_path='./wordcloud.png', with_mask=False, mask_pic=MASK_PATH):
        """Render a word cloud from {phrase: weight} and save it to disk.

        :param keyphrrase_weight: mapping of phrase -> weight (name kept for
                                  backward compatibility; note the typo)
        :param save_path: output image path
        :param with_mask: draw inside the shape of ``mask_pic`` when True
        :param mask_pic: path of the mask image
        """
        font = FONT_PATH
        if with_mask:
            # Bug fix: only open the mask file when it is actually used —
            # the original loaded it unconditionally, so a missing mask file
            # crashed even with with_mask=False.
            mask = np.array(Image.open(mask_pic))
            wc = WordCloud(
                background_color='white',
                width=800,
                height=800,
                mask=mask,
                font_path=font,
            )
        else:
            wc = WordCloud(
                background_color='white',
                width=800,
                height=800,
                font_path=font,
            )
        wc.generate_from_frequencies(keyphrrase_weight)  # render the image
        wc.to_file(save_path)  # save the image
def keyphrase_extract(text, topk=100, method='tfidf', with_word=False, save_pic="./wordcloud.png", with_mask=True):
    """Run key-phrase extraction and optionally save a word-cloud image.

    :param text: input text to analyze
    :param topk: how many keywords to extract for building phrases
    :param method: keyword extraction method, 'tfidf' or 'textrank'
    :param with_word: whether standalone keywords are included in the output
    :param save_pic: word-cloud save path; falsy value skips image generation
    :param with_mask: whether the image is drawn inside the mask shape
    :return: None (results are printed; image is written to save_pic)
    """
    key_phrase_extractor = KeyPhraseExtraction(topk=topk, method=method, with_word=with_word)
    key_phrase = key_phrase_extractor.key_phrase_extraction(text)
    print("keyphrase result: {}\n".format(key_phrase))
    if save_pic:
        # Bug fix: forward the caller's with_mask instead of hard-coding True,
        # which made the parameter a no-op.
        key_phrase_extractor.wordcloud(key_phrase, save_path=save_pic, with_mask=with_mask)
        print("word cloud save to: {}\n".format(save_pic))
if __name__ == '__main__':
    # Demo: extract key phrases from a short Chinese sample and write the
    # default word-cloud image.
    text = """
    果真,又是一起恶意举报投诉造谣污蔑事件。
    现在的网络也不是三年之前的网络环境了,
    一篇小作文可以让人死无葬身之地,
    现在网友都聪明了,小作文没人信,
    这些人又换了套路,恶意举报了,
    就好像有些人打电话恶意举报投诉,
    换着号码,换着姓,捏着鼻音,
    """
    keyphrase_extract(text)
# Sample output (返回结果):
# keyphrase result: {'造谣污蔑事件': 0.22837583667026315}