1. Convert a CSV file to a dict, weight the frequent words, and draw a word cloud
import csv
import collections

import jieba.analyse
from pyecharts import options as opts
from pyecharts.charts import WordCloud
from pyecharts.globals import SymbolType

# placeholder paths; the original snippet uses these constants without defining them
CSV_FILE_PATH = 'weibo.csv'
STOP_WORDS_FILE_PATH = 'stopwords.txt'

def read_csv_to_dict(index) -> dict:
    """
    Read the CSV data.
    Column layout: 'user id', 'user name', 'gender', 'region', 'birthday', 'weibo id', 'weibo content'
    :param index: which column to read, starting from 0
    :return: dict with the column values as keys and their counts as values
    """
    with open(CSV_FILE_PATH, 'r', encoding='utf-8') as csvfile:
        reader = csv.reader(csvfile)
        column = [columns[index] for columns in reader]
        print(column)
        dic = collections.Counter(column)  # count how often each element occurs
        # drop empty strings
        if '' in dic:
            dic.pop('')
        print(dic)  # e.g. Counter({'我太难了': 3, '超话粉丝大咖': 2, '#周杰伦演唱会# 抵制黄牛!真正歌迷抢不到': 4})
        return dic
def analysis_sina_content():
    # read the weibo content column
    dic = read_csv_to_dict(6)
    # data cleaning: register the stop-word file so invalid words are dropped
    jieba.analyse.set_stop_words(STOP_WORDS_FILE_PATH)
    # keyword extraction with TextRank
    words_count_list = jieba.analyse.textrank(' '.join(dic.keys()), topK=50, withWeight=True)
    # signature: jieba.analyse.textrank(sentence, topK=20, withWeight=False, allowPOS=('ns', 'n', 'vn', 'v'))
    #   sentence: the text to process
    #   topK: number of keywords to return, default 20
    #   withWeight: whether to return weights along with the words, default False
    #   allowPOS: part-of-speech tags to keep
    print(words_count_list)
    # e.g. [('杭州', 1.0), ('演唱会', 0.9047694519491188), ('抢到', 0.4155709853243528), ('门票', 0.32065316150633894)]
    print(len(words_count_list))  # 50
    # render the word cloud
    word_cloud = (
        WordCloud()
        .add("", words_count_list, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title="周杰伦打榜微博内容分析"))
    )
    word_cloud.render('word_cloud.html')
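The snippet only defines the two functions; nothing calls them. A minimal entry point, assuming CSV_FILE_PATH and STOP_WORDS_FILE_PATH point at real files:

if __name__ == '__main__':
    analysis_sina_content()  # writes word_cloud.html next to the script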
2. Count word frequencies from a pickled dict and generate a word cloud
Making a word cloud in Python roughly breaks down into these steps:
1. Prepare a stop-word text file (a minimal sketch follows this list)
2. Read the text and filter it with the stop words
3. Configure the word-cloud parameters and render the cloud in the shape of an image
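For step 1, the stop-word file is just UTF-8 plain text with one word per line. A minimal sketch that writes such a file (the entries here are arbitrary examples):

# write a tiny stop-word file: one word per line, UTF-8 encoded
stop_words = ['的', '了', '是', '我', '也']  # arbitrary sample entries
with open('./stopwords.txt', 'w', encoding='utf-8') as f:
    f.write('\n'.join(stop_words) + '\n')

The trailing newline matters here: the reading code below does split('\n')[:-1], which silently drops the last element.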
The full code is as follows:
import os
import pickle

import jieba
from wordcloud import WordCloud  # the wordcloud package this time, not pyecharts

'''count word frequencies'''
def statistics(texts, stopwords):
    words_dict = {}
    for text in texts:
        temp = jieba.cut(text)
        for t in temp:
            if t in stopwords or t == 'unknow':  # skip stop words and the 'unknow' placeholder
                continue
            if t in words_dict:
                words_dict[t] += 1
            else:
                words_dict[t] = 1
    return words_dict

'''draw the word cloud'''
def drawWordCloud(words, title, savepath='./results'):
    if not os.path.exists(savepath):
        os.mkdir(savepath)
    wc = WordCloud(font_path='simkai.ttf', background_color='white', max_words=2000,
                   width=1920, height=1080, margin=5)
    wc.generate_from_frequencies(words)
    wc.to_file(os.path.join(savepath, title + '.png'))

with open('python_61.pkl', 'rb') as f:
    data = pickle.load(f)
# build the word cloud
stopwords = open('./stopwords.txt', 'r', encoding='utf-8').read().split('\n')[:-1]
texts = [j[2] for i, j in data.items()]
words_dict = statistics(texts, stopwords)
drawWordCloud(words_dict, 'python相关图书简介词云', savepath='./results')  # title: "word cloud of Python book introductions"
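The script expects python_61.pkl to hold a dict whose values carry the book introduction at index 2 (hence j[2]). The exact layout isn't shown here; a compatible dummy file for testing might look like this (hypothetical structure):

import pickle

# hypothetical layout: {book_id: (title, author, introduction)}
dummy = {
    1: ('Fluent Python', 'Luciano Ramalho', 'A hands-on guide to idiomatic Python.'),
    2: ('Python Cookbook', 'David Beazley', 'Recipes for mastering Python 3.'),
}
with open('python_61.pkl', 'wb') as f:
    pickle.dump(dummy, f)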
3. Count word frequencies from a txt file and draw a word cloud
import imageio
import jieba
from wordcloud import WordCloud

def jieba_cut():
    # load the stop words for Journey to the West
    with open('西游记停用词.txt', 'r', encoding='utf-8') as fr:
        stop_word_list = fr.readlines()
    new_stop_word_list = []
    for stop_word in stop_word_list:
        stop_word = stop_word.replace('\ufeff', '').strip()  # strip the BOM and surrounding whitespace
        new_stop_word_list.append(stop_word)
    print(new_stop_word_list)  # print the cleaned stop words

    # count how often each word appears in the novel
    with open('xyj.txt', 'r', encoding='utf-8') as fr_xyj:
        s = fr_xyj.read()
    words = jieba.cut(s, cut_all=False)
    word_dict = {}
    word_list = ''
    for word in words:
        if len(word) > 1 and word not in new_stop_word_list:
            word_list = word_list + ' ' + word
            if word in word_dict:
                word_dict[word] += 1
            else:
                word_dict[word] = 1

    # sort by frequency, descending
    sort_words = sorted(word_dict.items(), key=lambda x: x[1], reverse=True)
    print(sort_words[0:101])  # print the 101 most frequent words (index 0-100)

    # generate the word cloud in the shape of the mask image
    color_mask = imageio.imread("1.png")
    wc = WordCloud(
        background_color="black",  # background color
        max_words=500,             # maximum number of words shown
        font_path="D:/软件(学习)/Python/PyCharm/font/simsun.ttc",  # font to use
        min_font_size=15,
        max_font_size=50,
        width=400,                 # canvas width
        height=860,                # canvas height
        mask=color_mask)           # mask that defines the shape
    wc.generate(word_list)
    wc.to_file("why.png")

jieba_cut()
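A note on the mask: the wordcloud package only draws words where the mask image is not pure white, so 1.png should be a dark shape on a white background. A quick way to inspect the mask, assuming Pillow and numpy are installed:

import numpy as np
from PIL import Image

mask = np.array(Image.open('1.png'))  # equivalent to the imageio.imread call above
print(mask.shape, mask.dtype)         # white (255) regions are left blank in the cloud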
The mask image that gives the cloud its shape: 1.png
The final generated word cloud: