该文本包括词频统计和词云图的Python代码。
一、词频统计:
import pandas as pd
import jieba
import re
from collections import Counter
def clean_words(words):
    """Return the tokens of *words* that are still useful after cleanup.

    Every token first has its punctuation removed (any character that is
    neither a word character nor whitespace) and its surrounding whitespace
    stripped.  Tokens shorter than two characters after that cleanup are
    discarded.

    The length check deliberately runs after the cleanup: checking it first
    would let a token made of one character plus a punctuation mark through
    as a single character, and would keep whitespace-only tokens.

    Args:
        words: Iterable of token strings.

    Returns:
        A new list with the cleaned tokens, in input order.
    """
    stripped = (re.sub(r'[^\w\s]', '', word).strip() for word in words)
    return [word for word in stripped if len(word) >= 2]
# Load the CSV to analyse; its first column is treated as the summary text.
excel_path = r'待分析文本.csv'
df = pd.read_csv(excel_path)
summary_col = df.columns[0]

# Accumulator for the tokens collected from every row.
all_words = []

# Load the stop words: one entry per line, surrounding whitespace removed.
with open(r'汇总停用词.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f}
# Segment every summary with jieba, clean the tokens and drop stop words.
# pandas reads empty CSV cells as NaN (a float), which jieba.cut cannot
# segment, so missing rows are skipped and the remaining values are coerced
# to str before segmentation.
for summary in df[summary_col].dropna().astype(str):
    cleaned_words = clean_words(jieba.cut(summary))
    # Collect the surviving tokens of this row into the global list.
    all_words.extend(word for word in cleaned_words if word not in stopwords)
# Persist the collected tokens, one per line.
output_path = r"分词结果.txt"
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in all_words)

# Load the saved tokens back and tally how often each one occurs.
with open(output_path, 'r', encoding='utf-8') as f:
    words = f.read().splitlines()
word_counts = Counter(words)

# Write the tallies, most frequent first, as "word: count" lines.
stats_output_path = r"关键词频结果.txt"
with open(stats_output_path, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{word}: {count}\n" for word, count in word_counts.most_common()
    )
二、词云图:
import numpy as np
from PIL import Image
import re
import jieba
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
import matplotlib.pyplot as plt
# Read the whole input file as plain text.  The `with` statement closes the
# file on exit, so no explicit close() call is needed.
with open('待处理文本.csv', 'r', encoding='utf-8') as f:
    word = f.read()

# Optional image mask (disabled) and the font file handed to WordCloud.
# image = np.array(Image.open('ditu.jpg'))
font = r'C:\Windows\Fonts\msyh.ttc'

# Remove ASCII letters, digits and the listed punctuation so that mostly the
# Chinese text remains.  The pattern is a raw string: it contains
# backslashes, and invalid escape sequences in a normal string literal raise
# a SyntaxWarning on Python 3.12+.
resultword = re.sub(r"[A-Za-z0-9\[\]`~!@#$%^&*()=|{}':;,.<>/?。\\]", "", word)

# Segment with jieba and join the tokens with spaces, the input format
# expected by WordCloud.generate.
wordlist_after_jieba = jieba.cut(resultword)
wl_space_split = " ".join(wordlist_after_jieba)
# Stop words: a copy of wordcloud's built-in STOPWORDS set.
sw = set(STOPWORDS)

# Configure the word cloud, then build it from the space-separated tokens.
my_wordcloud = WordCloud(
    scale=4,
    font_path=font,
    stopwords=sw,
    background_color='white',
    max_words=100,
    max_font_size=60,
    random_state=20,
)
my_wordcloud = my_wordcloud.generate(wl_space_split)

# Display the generated word cloud without axes.
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()

# Save the generated image to disk.
my_wordcloud.to_file('result.jpg')