使用结巴（jieba）分词并过滤停用词，生成词云图

最新推荐文章于 2024-05-07 10:59:26 发布

浩哥爱吃肉

最新推荐文章于 2024-05-07 10:59:26 发布

阅读量748

点赞数

本文链接：https://blog.csdn.net/zhh_love123/article/details/108244281

版权

直接上代码

# -*- coding: utf-8 -*-
# TIME: 2020/8/25
from collections import Counter

import jieba.analyse
import matplotlib.pyplot as plt
import jieba,re
from wordcloud import wordcloud


def read_file(file_name):
    """
    Read a UTF-8 text file and return its lines without trailing newlines.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line of the file, in file order. An
        empty file yields an empty list.
    """
    # The context manager closes the handle even if reading/decoding fails.
    with open(file_name, "r", encoding="utf-8") as fp:
        # Strip the line terminator so it cannot interfere with exact
        # matching later on (e.g. when the lines are used as stop words).
        return [line.rstrip("\n") for line in fp]

def save_file(file_name, content):
    """
    Write ``content`` to ``file_name`` as UTF-8, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        content: Text to write.
    """
    # The context manager flushes and closes the handle even if write() fails.
    with open(file_name, "w", encoding="utf-8") as fp:
        fp.write(content)

# Patterns are compiled once at import time instead of on every call.

# Leading "<digits>::" prefix at the very start of a line.
_USERNAME_REGEX = re.compile(r"^\d+::")

# URL-like tokens. [a-zA-Z0-9] is used instead of \w so that Chinese
# characters are never swallowed as part of a URL.
_URL_REGEX = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)

# Date markers: year/month/day characters and weekday names. Sunday (周日)
# is included so that removing "日" does not leave a stray "周" behind.
_DATE_REGEX = re.compile(r"""
    年 | 月 | 日 |
    周一 | 周二 | 周三 | 周四 | 周五 | 周六 | 周日
""", re.VERBOSE)

# Runs of digits that are not attached to a preceding ASCII letter.
# A lookbehind is used so the character in front of the number is kept
# (a consuming [^a-zA-Z] would delete it) and so that a number at the very
# start of the line is matched as well.
_DECIMAL_REGEX = re.compile(r"(?<![a-zA-Z\d])\d+")

# Any run of whitespace.
_SPACE_REGEX = re.compile(r"\s+")


def regex_change(line):
    """
    Strip noise from one line of text before word segmentation.

    The following are removed, in this order:
      1. a leading ``<digits>::`` prefix,
      2. URL-like tokens (optional ``http(s)://``, dotted ASCII host,
         optional ASCII path segments),
      3. the date characters 年 / 月 / 日 and the weekday names 周一..周日,
      4. digit runs that do not directly follow an ASCII letter
         (so ``mp3`` is kept, ``有3个`` becomes ``有个``),
      5. all whitespace.

    Args:
        line: The text line to clean.

    Returns:
        The cleaned line (possibly an empty string).
    """
    for pattern in (
        _USERNAME_REGEX,
        _URL_REGEX,
        _DATE_REGEX,
        _DECIMAL_REGEX,
        _SPACE_REGEX,
    ):
        line = pattern.sub("", line)
    return line

def delete_stopwords(lines):
    """
    Segment each line with jieba and drop every token that is a stop word.

    Stop words are loaded from ``stop_words.txt`` (one entry per line, read
    via ``read_file``) on every call.

    Args:
        lines: Iterable of text lines to segment.

    Returns:
        A flat list of the remaining tokens, in order of appearance;
        duplicates are kept.
    """
    # A set gives O(1) membership tests; with a list every token would
    # trigger a linear scan over the whole stop-word file.
    stopwords = set(read_file("stop_words.txt"))
    all_words = []

    for line in lines:
        all_words.extend(
            word for word in jieba.cut(line) if word not in stopwords
        )

    return all_words


def Generate_WordsCloud(cut_text, *, font_path='simsun.ttf',
                        mask_path='timg2.jpg', output_path='jielun.png',
                        figure_name='jielun'):
    """
    Render a word cloud from a sequence of tokens, save it and display it.

    Args:
        cut_text: Iterable of word strings; they are joined with single
            spaces before being handed to the word-cloud generator.
        font_path: Font file passed to ``WordCloud``.
        mask_path: Image file loaded with ``plt.imread`` and passed to
            ``WordCloud`` as its ``mask``.
        output_path: File the rendered cloud is written to.
        figure_name: Name of the matplotlib figure used for display.

    The keyword arguments default to the values that were previously
    hard-coded, so ``Generate_WordsCloud(words)`` behaves as before.
    """
    result = " ".join(cut_text)

    # Build and render the word cloud.
    wc = wordcloud.WordCloud(
        font_path=font_path,
        background_color='white',
        width=1000,
        height=600,
        max_font_size=50,
        min_font_size=10,
        mask=plt.imread(mask_path),  # image passed as the cloud's mask
        max_words=1000
    )
    wc.generate(result)
    wc.to_file(output_path)  # persist the rendered image

    # Show the image in a matplotlib window.
    plt.figure(figure_name)
    plt.imshow(wc)
    plt.axis('off')  # hide the axes
    plt.show()
    plt.close()


if __name__ == '__main__':
    # Pipeline: read the corpus, clean each line, segment and filter out
    # stop words, then render the word cloud.
    cleaned_lines = [regex_change(raw_line) for raw_line in read_file('txt.txt')]
    Generate_WordsCloud(delete_stopwords(cleaned_lines))









# # 1.读出词语
# text = open('txt.txt', 'r', encoding='utf-8').read()
# # print(text)
# # 2.把歌词剪开
#
# cut_text = jieba.analyse.extract_tags(text, topK=1000, withWeight=False, allowPOS=("ns", "n", "vn", "v", "nr"))
# print(cut_text)
# # print(cut_text)
# # print(type(cut_text))
# # print(next(cut_text))
# # print(next(cut_text))
# # 3.以空格拼接起来
# result = " ".join(cut_text)
# # print(result)
# # 4.生成词云
# wc = wordcloud.WordCloud(
#     font_path='simsun.ttf',  # 字体路劲
#     background_color='white',  # 背景颜色
#     width=1000,
#     height=600,
#     max_font_size=50,  # 字体大小
#     min_font_size=10,
#     mask=plt.imread('timg2.jpg'),  # 背景图片
#     max_words=1000
# )
# wc.generate(result)
# wc.to_file('jielun.png')  # 图片保存
#
# # 5.显示图片
# plt.figure('jielun')  # 图片显示的名字
# plt.imshow(wc)
# plt.axis('off')  # 关闭坐标
# plt.show()
# plt.close()