# 关键短语抽取 (key phrase extraction)
# -*- coding: utf-8 -*-
import os
import jieba
import jieba.analyse
import re
import numpy as np
from wordcloud import WordCloud
from PIL import Image
CURRENT_PATH = os.path.abspath(__file__)
DATA_PATH = os.path.dirname(os.path.dirname(CURRENT_PATH))
FONT_PATH = os.path.join(DATA_PATH, "data", "simfang.ttf")
MASK_PATH = os.path.join(DATA_PATH, "data", "map.png")
class KeyPhraseExtraction():
    """Extract key phrases by concatenating runs of adjacent keywords.

    Pipeline: extract weighted keywords (tfidf/textrank) -> split the text
    into sentences and tokenize -> join adjacent keywords into candidate
    phrases -> filter near-duplicate phrases -> score each phrase with the
    mean weight of its (at most 3 trailing) keywords.
    """

    def __init__(self, topk=50, method='tfidf', with_word=True):
        """
        :param topk: number of top keywords used to build phrases
        :param method: keyword extraction algorithm, 'tfidf' or 'textrank'
        :param with_word: whether standalone keywords are also included in
                          the final output alongside phrases
        """
        self.topk = topk
        self.method = method
        self.with_word = with_word

    def cut_sentences(self, text):
        """Split text into sentences, then tokenize each with jieba.

        NOTE(review): the regex keeps only spans ending in 。?! — any
        trailing text without terminal punctuation is dropped.
        """
        sentences = re.findall(".*?[。?!]", text)
        cut_sentences = [jieba.lcut(sent) for sent in sentences]
        return cut_sentences

    def key_words_extraction(self, text):
        """Return {keyword: weight} using the configured method."""
        keywords_score = []
        if self.method == 'tfidf':
            keywords_score = jieba.analyse.extract_tags(text, topK=self.topk, withWeight=True)
        elif self.method == 'textrank':
            keywords_score = jieba.analyse.textrank(text, topK=self.topk, withWeight=True)
        return {word: score for word, score in keywords_score}

    def key_phrase_extraction(self, text):
        """Return {phrase: weight} for the topk highest-scoring phrases."""
        keyword_score = self.key_words_extraction(text)
        keywords = keyword_score.keys()
        cut_sentences = self.cut_sentences(text)
        # Concatenate adjacent keywords into candidate phrases (>= 2 words).
        key_phrase = []
        for sent in cut_sentences:
            temp = []
            for word in sent:
                if word in keywords:
                    temp.append(word)
                else:
                    if len(temp) > 1:
                        if temp not in key_phrase:
                            key_phrase.append(temp)
                    temp = []
            # Bug fix: flush a keyword run that reaches the end of the
            # sentence — the original only flushed when a non-keyword token
            # followed, silently dropping sentence-final phrases.
            if len(temp) > 1 and temp not in key_phrase:
                key_phrase.append(temp)
        # Phrases may carry redundant information; keep a phrase only if it
        # shares fewer than half of the smaller phrase's words with every
        # phrase already kept.
        key_phrase_filter = []
        for phrase in key_phrase:
            flag = False
            for item in key_phrase_filter:
                if len(set(phrase) & set(item)) >= min(len(set(phrase)), len(set(item))) / 2.0:
                    flag = True
                    break
            if not flag:
                key_phrase_filter.append(phrase)
        # Weight each phrase: cap at the last 3 keywords and average their
        # individual keyword weights.
        keyphrase_weight = {''.join(phrase[-3:]): np.mean([keyword_score[word] for word in phrase[-3:]])
                            for phrase in key_phrase_filter}
        if self.with_word:
            # Add standalone keywords that are not already covered by a phrase.
            key_phrase_str = '|'.join(keyphrase_weight)
            for word, weight in keyword_score.items():
                if word not in key_phrase_str:
                    keyphrase_weight[word] = weight
        keyphrase_weight = dict(sorted(keyphrase_weight.items(), key=lambda x: x[1], reverse=True)[:self.topk])
        return keyphrase_weight

    def wordcloud(self, keyphrrase_weight, save_path='./wordcloud.png', with_mask=False, mask_pic=MASK_PATH):
        """Render a word cloud from {phrase: weight} and save it to disk.

        :param keyphrrase_weight: mapping of phrase -> weight (name kept for
                                  backward compatibility; note the typo)
        :param save_path: output image path
        :param with_mask: draw inside the shape of ``mask_pic`` when True
        :param mask_pic: path of the mask image
        """
        font = FONT_PATH
        if with_mask:
            # Bug fix: only open the mask file when it is actually used —
            # the original loaded it unconditionally, so a missing mask file
            # crashed even with with_mask=False.
            mask = np.array(Image.open(mask_pic))
            wc = WordCloud(
                background_color='white',
                width=800,
                height=800,
                mask=mask,
                font_path=font,
            )
        else:
            wc = WordCloud(
                background_color='white',
                width=800,
                height=800,
                font_path=font,
            )
        wc.generate_from_frequencies(keyphrrase_weight)  # render the image
        wc.to_file(save_path)  # save the image
def keyphrase_extract(text, topk=100, method='tfidf', with_word=False, save_pic="./wordcloud.png", with_mask=True):
    """Run key-phrase extraction and optionally save a word-cloud image.

    :param text: input text to analyze
    :param topk: how many keywords to extract for building phrases
    :param method: keyword extraction method, 'tfidf' or 'textrank'
    :param with_word: whether standalone keywords are included in the output
    :param save_pic: word-cloud save path; falsy value skips image generation
    :param with_mask: whether the image is drawn inside the mask shape
    :return: None (results are printed; image is written to save_pic)
    """
    key_phrase_extractor = KeyPhraseExtraction(topk=topk, method=method, with_word=with_word)
    key_phrase = key_phrase_extractor.key_phrase_extraction(text)
    print("keyphrase result: {}\n".format(key_phrase))
    if save_pic:
        # Bug fix: forward the caller's with_mask instead of hard-coding True,
        # which made the parameter a no-op.
        key_phrase_extractor.wordcloud(key_phrase, save_path=save_pic, with_mask=with_mask)
        print("word cloud save to: {}\n".format(save_pic))
if __name__ == '__main__':
    # Demo: extract key phrases from a short Chinese sample and write the
    # default word-cloud image.
    text = """
    果真,又是一起恶意举报投诉造谣污蔑事件。
    现在的网络也不是三年之前的网络环境了,
    一篇小作文可以让人死无葬身之地,
    现在网友都聪明了,小作文没人信,
    这些人又换了套路,恶意举报了,
    就好像有些人打电话恶意举报投诉,
    换着号码,换着姓,捏着鼻音,
    """
    keyphrase_extract(text)
# Sample output (返回结果):
# keyphrase result: {'造谣污蔑事件': 0.22837583667026315}