词频统计和词云图的运行代码

最新推荐文章于 2024-06-14 23:17:54 发布

criket

最新推荐文章于 2024-06-14 23:17:54 发布

阅读量170

点赞数 1

文章标签： c# 开发语言

本文链接：https://blog.csdn.net/criket/article/details/139630891

版权

本文给出词频统计和词云图两部分的 Python 代码，运行前需安装 pandas、jieba、wordcloud 和 matplotlib。

一、词频统计：

import pandas as pd  
import jieba  
import re  
from collections import Counter  
  
def clean_words(words):
    """Normalise segmented tokens and drop the ones that are too short.

    Punctuation is stripped *before* the length check, so a token such as
    ``"a,"`` cannot slip through as the single character ``"a"``.

    Args:
        words: Iterable of token strings (e.g. the output of ``jieba.cut``).

    Returns:
        A list of cleaned tokens, in input order, each at least two
        characters long.
    """
    # Remove every character that is neither a word character nor whitespace,
    # then trim the edges so whitespace-only tokens collapse to "".
    stripped = (re.sub(r'[^\w\s]', '', word).strip() for word in words)
    # Drop single characters and tokens that became empty after cleaning.
    return [word for word in stripped if len(word) >= 2]
  
# Read the text to analyse from a CSV file; the first column is the one
# that gets segmented.
excel_path = r'待分析文本.csv'
df = pd.read_csv(excel_path)
summary_col = df.columns[0]

# Accumulates the cleaned tokens of every row.
all_words = []

# Load the stop-word list (one word per line), ignoring blank lines.
with open(r'汇总停用词.txt', 'r', encoding='utf-8') as f:
    stopwords = {line.strip() for line in f if line.strip()}

# Segment every row. pandas reads empty cells as NaN (a float) and may parse
# other cells as numbers, while jieba only accepts strings, so missing values
# are skipped and the remaining cells are converted to str first.
for summary in df[summary_col].dropna().astype(str):
    cleaned_words = clean_words(list(jieba.cut(summary)))
    # Keep only tokens that are not stop words.
    all_words.extend(word for word in cleaned_words if word not in stopwords)

# Write the segmentation result, one token per line.
output_path = r"分词结果.txt"
with open(output_path, 'w', encoding='utf-8') as f:
    f.writelines(f"{word}\n" for word in all_words)

# Count frequencies straight from the in-memory token list; re-reading the
# file that was just written is unnecessary.
word_counts = Counter(all_words)

# Write "<word>: <count>" lines, most frequent first.
stats_output_path = r"关键词频结果.txt"
with open(stats_output_path, 'w', encoding='utf-8') as f:
    f.writelines(
        f"{word}: {count}\n" for word, count in word_counts.most_common()
    )

二、词云图：

import numpy as np
from PIL import Image
import re
import jieba
from wordcloud import WordCloud,ImageColorGenerator,STOPWORDS
import  matplotlib.pyplot as plt

# Read the raw text to visualise; the `with` block closes the file itself.
with open('待处理文本.csv', 'r', encoding='utf-8') as f:
    word = f.read()

# Optional shape template (currently unused) and the font used for rendering.
#image=np.array(Image.open('ditu.jpg'))
# The path must be plain ASCII: an invisible bidi control character in front
# of "C:" (easily picked up when copying a path from a Windows dialog) makes
# the font file impossible to open.
font = r'C:\Windows\Fonts\msyh.ttc'

# Strip ASCII letters, digits and the listed punctuation so that mostly
# Chinese text remains. The pattern is a raw string so the backslashes reach
# the regex engine without invalid-escape warnings.
resultword = re.sub(r"[A-Za-z0-9\[\`\~\!\@\#\$\^\&\*\(\)\=\|\{\}\'\:\;\'\,\[\]\.\<\>\/\?\~\。\@\#\\\&\*\%]", "", word)
wordlist_after_jieba = jieba.cut(resultword)
# WordCloud.generate expects one string; join the tokens with spaces.
wl_space_split = " ".join(wordlist_after_jieba)

# Stop words: the default set shipped with the wordcloud package.
sw = set(STOPWORDS)

# Build the word cloud from the space-separated tokens.
my_wordcloud = WordCloud(
    scale=4,
    font_path=font,
    stopwords=sw,
    background_color='white',
    max_words=100,
    max_font_size=60,
    random_state=20,
).generate(wl_space_split)

# Display the generated word cloud without axes.
plt.imshow(my_wordcloud)
plt.axis("off")
plt.show()

# Save the generated image.
my_wordcloud.to_file('result.jpg')