# 直接上代码 (Straight to the code)
# -*- coding: utf-8 -*-
# TIME: 2020/8/25
from collections import Counter
import jieba.analyse
import matplotlib.pyplot as plt
import jieba,re
from wordcloud import wordcloud
def read_file(file_name):
    """Read a UTF-8 text file and return its lines without line breaks.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one string per line, in file order, with the trailing
        newline removed from each entry. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when reading or decoding
    # fails part-way through.
    with open(file_name, "r", encoding="utf-8") as fp:
        # Strip the line break so it cannot interfere with stop-word matching.
        return [line.rstrip("\n") for line in fp]
def save_file(file_name, content):
    """Write *content* to *file_name* as UTF-8 text.

    The file is opened in "w" mode, so an existing file is truncated.

    Args:
        file_name: Path of the file to write.
        content: The string to write.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    # The context manager flushes and closes the handle even if write() fails.
    with open(file_name, "w", encoding="utf-8") as fp:
        fp.write(content)
# The patterns below are compiled once at import time because regex_change()
# is called once per input line.

# Leading "<digits>::" prefix at the start of a line.
_USERNAME_REGEX = re.compile(r"^\d+::")

# URL-like tokens. [a-zA-Z0-9] is used instead of \w so that Chinese
# characters are never swallowed as part of a URL. The scheme is optional, so
# any dotted ASCII token (e.g. "a.b") is matched as well.
_URL_REGEX = re.compile(r"""
    (https?://)?
    ([a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)
    (\.[a-zA-Z0-9]+)*
    (/[a-zA-Z0-9]+)*
""", re.VERBOSE | re.IGNORECASE)

# Date words: year / month / day markers and weekday names. The weekday class
# includes 周日; without it the bare 日 alternative would leave a stray "周".
_DATE_REGEX = re.compile(r"周[一二三四五六日]|年|月|日")

# Runs of digits that are not attached to a preceding ASCII letter or digit.
# A lookbehind is used so the character in front of the number is NOT
# consumed (a plain [^a-zA-Z] prefix would delete it along with the digits).
_DECIMAL_REGEX = re.compile(r"(?<![a-zA-Z0-9])\d+")

# Any run of whitespace.
_SPACE_REGEX = re.compile(r"\s+")


def regex_change(line):
    """Strip noise from one line of text before word segmentation.

    The following are removed, in this order:

    1. a leading "<digits>::" prefix,
    2. URL-like tokens (dotted ASCII tokens with an optional http/https
       scheme),
    3. date words (年, 月, 日 and the weekday names 周一 to 周日),
    4. digit runs that do not directly follow an ASCII letter or digit
       (so "mp3" is kept while "3个" becomes "个"),
    5. all whitespace.

    Args:
        line: The text line to clean.

    Returns:
        The cleaned line; may be an empty string.
    """
    line = _USERNAME_REGEX.sub("", line)
    line = _URL_REGEX.sub("", line)
    line = _DATE_REGEX.sub("", line)
    line = _DECIMAL_REGEX.sub("", line)
    return _SPACE_REGEX.sub("", line)
def delete_stopwords(lines, stopwords_path="stop_words.txt"):
    """Segment every line with jieba and drop the tokens that are stop words.

    Args:
        lines: Iterable of text lines to segment.
        stopwords_path: Path of the stop-word file, one stop word per line.
            Defaults to "stop_words.txt", the file the original code used.

    Returns:
        A flat list of the remaining tokens, in the order they were produced
        (duplicates are kept).

    Raises:
        OSError: If the stop-word file cannot be read.
    """
    # Build the set once: membership is tested for every token of every line,
    # and a set lookup is O(1) where a list scan is O(number of stop words).
    stopwords = set(read_file(stopwords_path))
    all_words = []
    for line in lines:
        all_words.extend(
            word for word in jieba.cut(line) if word not in stopwords
        )
    return all_words
def Generate_WordsCloud(cut_text, *, font_path='simsun.ttf',
                        mask_path='timg2.jpg', output_path='jielun.png',
                        figure_name='jielun'):
    """Build a word cloud from *cut_text*, save it to disk and display it.

    Args:
        cut_text: Iterable of word strings; they are joined with single
            spaces before being handed to WordCloud.generate().
        font_path: Font file used to draw the words. Defaults to
            'simsun.ttf'.
        mask_path: Image file loaded with plt.imread() and passed as the
            word-cloud mask. Defaults to 'timg2.jpg'.
        output_path: File the rendered image is written to. Defaults to
            'jielun.png'.
        figure_name: Name given to the matplotlib figure. Defaults to
            'jielun'.
    """
    result = " ".join(cut_text)

    # Configure and generate the word cloud.
    # NOTE(review): the wordcloud documentation says width/height are ignored
    # when a mask is supplied; they are still passed to keep the original
    # configuration intact -- confirm before removing.
    wc = wordcloud.WordCloud(
        font_path=font_path,                # font file path
        background_color='white',           # background colour
        width=1000,
        height=600,
        max_font_size=50,                   # largest font size
        min_font_size=10,                   # smallest font size
        mask=plt.imread(mask_path),         # mask / background image
        max_words=1000,
    )
    wc.generate(result)
    wc.to_file(output_path)                 # save the rendered image

    # Display the image in a matplotlib window.
    plt.figure(figure_name)                 # name of the figure window
    plt.imshow(wc)
    plt.axis('off')                         # hide the axes
    plt.show()
    plt.close()
if __name__ == '__main__':
    # Pipeline: read the raw text, clean every line, segment it and drop the
    # stop words, then render the word cloud.
    lines = [regex_change(line) for line in read_file('txt.txt')]
    bow_words = delete_stopwords(lines)
    Generate_WordsCloud(bow_words)
# print(bow_words)
# # 1.读出词语
# text = open('txt.txt', 'r', encoding='utf-8').read()
# # print(text)
# # 2.把歌词剪开
#
# cut_text = jieba.analyse.extract_tags(text, topK=1000, withWeight=False, allowPOS=("ns", "n", "vn", "v", "nr"))
# print(cut_text)
# # print(cut_text)
# # print(type(cut_text))
# # print(next(cut_text))
# # print(next(cut_text))
# # 3.以空格拼接起来
# result = " ".join(cut_text)
# # print(result)
# # 4.生成词云
# wc = wordcloud.WordCloud(
# font_path='simsun.ttf', # 字体路劲
# background_color='white', # 背景颜色
# width=1000,
# height=600,
# max_font_size=50, # 字体大小
# min_font_size=10,
# mask=plt.imread('timg2.jpg'), # 背景图片
# max_words=1000
# )
# wc.generate(result)
# wc.to_file('jielun.png') # 图片保存
#
# # 5.显示图片
# plt.figure('jielun') # 图片显示的名字
# plt.imshow(wc)
# plt.axis('off') # 关闭坐标
# plt.show()
# plt.close()