关键短语抽取
# -*- coding: utf-8 -*-
import os

import jieba
import jieba.analyse
import re
import numpy as np
from wordcloud import WordCloud
from PIL import Image

# Resource locations resolved relative to this module: the project root is
# assumed to be two directory levels above this file, with the bundled font
# and mask image stored under its "data" directory.
CURRENT_PATH = os.path.abspath(__file__)
DATA_PATH = os.path.dirname(os.path.dirname(CURRENT_PATH))
FONT_PATH = os.path.join(DATA_PATH, "data", "simfang.ttf")   # CJK-capable font for the word cloud
MASK_PATH = os.path.join(DATA_PATH, "data", "map.png")       # default shape mask image


class KeyPhraseExtraction:
    """Extract key phrases from Chinese text by merging adjacent keywords.

    Pipeline: extract top-k keywords with jieba (TF-IDF or TextRank), split
    the text into sentences and tokenize them, join runs of adjacent keywords
    inside a sentence into candidate phrases, drop phrases that overlap too
    much with an already-kept one, and score each surviving phrase with the
    mean weight of (at most) its last three keywords.
    """

    def __init__(self, topk=50, method='tfidf', with_word=True):
        """
        :param topk: number of top keywords used to build phrases
        :param method: keyword-extraction algorithm, 'tfidf' or 'textrank'
        :param with_word: whether plain keywords (not covered by any phrase)
            are also included in the final result
        """
        self.topk = topk
        self.method = method
        self.with_word = with_word

    def cut_sentences(self, text):
        """Split text into sentences, then tokenize each sentence with jieba.

        Sentences are delimited by 。?! or a newline.  Unlike a plain
        ``.*?[。?!]`` scan, a trailing fragment without terminal punctuation
        is kept instead of being silently dropped.
        """
        sentences = re.findall(r"[^。?!\n]+[。?!]?", text)
        return [jieba.lcut(sent) for sent in sentences]

    def key_words_extraction(self, text):
        """Return {keyword: weight} for the top-k keywords of *text*.

        :raises ValueError: if ``self.method`` is neither 'tfidf' nor 'textrank'
        """
        if self.method == 'tfidf':
            keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
        elif self.method == 'textrank':
            keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
        else:
            raise ValueError("method must be 'tfidf' or 'textrank', got {!r}".format(self.method))
        return {word: score for word, score in keywords_score}

    def key_phrase_extraction(self, text):
        """Return {phrase: weight} for the merged key phrases of *text*."""
        keyword_score = self.key_words_extraction(text)
        keywords = set(keyword_score)  # set for O(1) membership tests
        cut_sentences = self.cut_sentences(text)

        # Join runs of adjacent keywords within a sentence into candidate
        # phrases; a non-keyword token flushes the current run.  A run is
        # only kept when it contains at least two keywords.
        key_phrase = []
        for sent in cut_sentences:
            temp = []
            for word in sent:
                if word in keywords:
                    temp.append(word)
                else:
                    if len(temp) > 1 and temp not in key_phrase:
                        key_phrase.append(temp)
                    temp = []

        # Phrases may carry redundant information: drop a candidate whose
        # keyword set overlaps an already-kept phrase by at least half of
        # the smaller set.
        key_phrase_filter = []
        for phrase in key_phrase:
            phrase_set = set(phrase)
            redundant = any(
                len(phrase_set & set(item)) >= min(len(phrase_set), len(set(item))) / 2.0
                for item in key_phrase_filter
            )
            if not redundant:
                key_phrase_filter.append(phrase)

        # Score each phrase: cap at the last three keywords and use their
        # mean keyword weight.
        keyphrase_weight = {''.join(phrase[-3:]): np.mean([keyword_score[word] for word in phrase[-3:]])
                            for phrase in key_phrase_filter}

        if self.with_word:
            # Add standalone keywords that are not already contained (as a
            # substring) in any produced phrase.
            key_phrase_str = '|'.join(keyphrase_weight)
            for word, weight in keyword_score.items():
                if word not in key_phrase_str:
                    keyphrase_weight[word] = weight

        # Keep only the top-k heaviest entries, sorted by weight.
        keyphrase_weight = dict(sorted(keyphrase_weight.items(), key=lambda x: x[1], reverse=True)[:self.topk])
        return keyphrase_weight

    def wordcloud(self, keyphrrase_weight, save_path='./wordcloud.png', with_mask=False, mask_pic=MASK_PATH):
        """Render a word cloud from a {phrase: weight} mapping and save it.

        :param keyphrrase_weight: mapping of phrase -> weight (parameter name
            kept as-is for backward compatibility with existing callers)
        :param save_path: output image path
        :param with_mask: draw inside the shape given by *mask_pic*
        :param mask_pic: path to the mask image, used only when *with_mask*
        """
        options = dict(
            background_color='white',
            width=800,
            height=800,
            font_path=FONT_PATH,
        )
        if with_mask:
            # Only open the mask image when it is actually used, so a missing
            # map.png no longer breaks mask-less rendering.
            options['mask'] = np.array(Image.open(mask_pic))
        wc = WordCloud(**options)
        wc.generate_from_frequencies(keyphrrase_weight)  # render the cloud
        wc.to_file(save_path)  # save the image

def keyphrase_extract(text, topk=100, method='tfidf', with_word=False, save_pic="./wordcloud.png", with_mask=True):
    """Key-phrase extraction convenience wrapper.

    :param text: input text to analyse
    :param topk: number of keywords used to build phrases
    :param method: keyword-extraction algorithm, 'tfidf' or 'textrank'
    :param with_word: whether standalone keywords are included in the output
    :param save_pic: word-cloud output path; falsy value skips image generation
    :param with_mask: whether the word cloud uses the background mask image
    :return: None (results are printed; the image is written to *save_pic*)
    """
    key_phrase_extractor = KeyPhraseExtraction(topk=topk, method=method, with_word=with_word)
    key_phrase = key_phrase_extractor.key_phrase_extraction(text)
    print("keyphrase result: {}\n".format(key_phrase))
    if save_pic:
        # Bug fix: forward the caller's with_mask instead of hard-coding True.
        key_phrase_extractor.wordcloud(key_phrase, save_path=save_pic, with_mask=with_mask)
        print("word cloud save to: {}\n".format(save_pic))

if __name__ == '__main__':
    # Demo: extract key phrases from a short Chinese text snippet, print the
    # {phrase: weight} result, and write a word cloud to ./wordcloud.png.
    text = """
果真,又是一起恶意举报投诉造谣污蔑事件。
现在的网络也不是三年之前的网络环境了,
一篇小作文可以让人死无葬身之地,
现在网友都聪明了,小作文没人信,
这些人又换了套路,恶意举报了,
就好像有些人打电话恶意举报投诉,
换着号码,换着姓,捏着鼻音,
    """
    keyphrase_extract(text)

返回结果

keyphrase result: {'造谣污蔑事件': 0.22837583667026315}

【附件】simfang.ttf
【附件】map.png

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

丢了个猪

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值