使用结巴分词并去除停用词后生成词云图

直接上代码

# -*- coding: utf-8 -*-
# TIME: 2020/8/25
from collections import Counter

import jieba.analyse
import matplotlib.pyplot as plt
import jieba,re
from wordcloud import wordcloud


def read_file(file_name):
    """Read a UTF-8 text file and return its lines without trailing newlines.

    The trailing newline of every line is stripped because it would otherwise
    prevent exact matches when the lines are used as stop words.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line of the file (empty for an empty file).
    """
    # The context manager closes the file even if reading or decoding fails.
    with open(file_name, "r", encoding="utf-8") as fp:
        return [line.rstrip("\n") for line in fp]

def save_file(file_name, content):
    """Write ``content`` to ``file_name`` as UTF-8, replacing any existing file.

    Args:
        file_name: Path of the file to create or overwrite.
        content: Text to write.
    """
    # The context manager flushes and closes the file even if the write fails.
    with open(file_name, "w", encoding="utf-8") as fp:
        fp.write(content)

# The patterns are compiled once at import time because regex_change is
# called for every input line.

# Leading "<digits>::" prefix.
_USERNAME_REGEX = re.compile(r"^\d+::")
# URL-like tokens. [a-zA-Z0-9] is used instead of \w so that Chinese
# characters are never swallowed as part of a URL.
_URL_REGEX = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)
# Date markers. Weekdays are listed first so that "周日" is removed as a
# whole instead of leaving a dangling "周" once "日" has been stripped.
_DATE_REGEX = re.compile(r"周[一二三四五六日]|[年月日]")
# Numbers that are not attached to a Latin letter or another digit. The
# lookbehind is zero-width, so the character in front of the number is kept
# and a number at the very start of the line is matched as well.
_DECIMAL_REGEX = re.compile(r"(?<![a-zA-Z\d])\d+")
# Any run of whitespace.
_SPACE_REGEX = re.compile(r"\s+")


def regex_change(line):
    """Strip noise from one line of text before word segmentation.

    The following are removed, in this order: a leading ``<digits>::``
    prefix, URL-like tokens, date markers (年/月/日 and 周一 … 周日),
    standalone numbers, and all whitespace. Digits that directly follow a
    Latin letter (e.g. ``mp3``) are kept.

    Args:
        line: A single line of raw text.

    Returns:
        The cleaned line.
    """
    for pattern in (
        _USERNAME_REGEX,
        _URL_REGEX,
        _DATE_REGEX,
        _DECIMAL_REGEX,
        _SPACE_REGEX,
    ):
        line = pattern.sub("", line)
    return line

def delete_stopwords(lines, stopwords_path="stop_words.txt"):
    """Segment every line with jieba and drop the tokens that are stop words.

    Args:
        lines: Iterable of text lines to segment.
        stopwords_path: Path of the stop-word file, one stop word per line.
            Defaults to ``"stop_words.txt"``.

    Returns:
        A flat list of the remaining tokens, in their original order
        (duplicates are kept).
    """
    # A set gives O(1) membership tests; the stop-word list is checked once
    # per token, so a list lookup here would dominate the running time.
    stopwords = set(read_file(stopwords_path))

    all_words = []
    for line in lines:
        all_words.extend(
            word for word in jieba.cut(line) if word not in stopwords
        )
    return all_words


def Generate_WordsCloud(cut_text, font_path='simsun.ttf', mask_path='timg2.jpg',
                        output_path='jielun.png', title='jielun'):
    """Build a word cloud from the given words, save it and display it.

    Args:
        cut_text: Iterable of word strings; they are joined with spaces
            before being passed to the word cloud generator.
        font_path: Path of the font file used to draw the words.
        mask_path: Path of the image used as the word cloud mask.
        output_path: Path the rendered image is written to.
        title: Name of the matplotlib figure used to display the image.
    """
    result = " ".join(cut_text)

    # Build the word cloud.
    # NOTE(review): the wordcloud documentation states that width/height are
    # ignored when a mask is supplied -- confirm before relying on them.
    wc = wordcloud.WordCloud(
        font_path=font_path,
        background_color='white',
        width=1000,
        height=600,
        max_font_size=50,
        min_font_size=10,
        mask=plt.imread(mask_path),
        max_words=1000,
    )
    wc.generate(result)
    wc.to_file(output_path)

    # Show the rendered image without axes.
    plt.figure(title)
    plt.imshow(wc)
    plt.axis('off')
    plt.show()
    plt.close()


if __name__ == '__main__':
    # Clean every input line, drop the stop words, then render the word cloud.
    lines = [regex_change(line) for line in read_file('txt.txt')]
    bow_words = delete_stopwords(lines)
    Generate_WordsCloud(bow_words)









# # 1.读出词语
# text = open('txt.txt', 'r', encoding='utf-8').read()
# # print(text)
# # 2.把歌词剪开
#
# cut_text = jieba.analyse.extract_tags(text, topK=1000, withWeight=False, allowPOS=("ns", "n", "vn", "v", "nr"))
# print(cut_text)
# # print(cut_text)
# # print(type(cut_text))
# # print(next(cut_text))
# # print(next(cut_text))
# # 3.以空格拼接起来
# result = " ".join(cut_text)
# # print(result)
# # 4.生成词云
# wc = wordcloud.WordCloud(
#     font_path='simsun.ttf',  # 字体路径
#     background_color='white',  # 背景颜色
#     width=1000,
#     height=600,
#     max_font_size=50,  # 字体大小
#     min_font_size=10,
#     mask=plt.imread('timg2.jpg'),  # 背景图片
#     max_words=1000
# )
# wc.generate(result)
# wc.to_file('jielun.png')  # 图片保存
#
# # 5.显示图片
# plt.figure('jielun')  # 图片显示的名字
# plt.imshow(wc)
# plt.axis('off')  # 关闭坐标
# plt.show()
# plt.close()

 

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值