python做词云做文本处理_Python文本处理: 分词和词云图

最新推荐文章于 2021-03-26 02:53:19 发布

weixin_39846364

最新推荐文章于 2021-03-26 02:53:19 发布

阅读量256

点赞数

文章标签： python做词云做文本处理

'''

import os

import jieba # 分词包

import numpy # numpy计算包

import codecs # codecs提供open方法指定打开的文件的语言编码，它会在读取时自动转换为内部的unicode

import pandas # 统计学工具包

import matplotlib.pyplot as plt

from wordcloud import WordCloud, ImageColorGenerator # 词云包

from scipy.misc import imread

from time import sleep

def join_txt():
    """Merge every file in ``<cwd>/corpus`` into ``all_result.txt``.

    Each regular file found in the ``corpus`` directory under the current
    working directory is read as UTF-8 and appended to ``all_result.txt``
    (created or truncated in the current working directory). One newline is
    written after each file so the last line of one file never runs into the
    first line of the next. Files are processed in sorted name order so the
    merged output is reproducible.

    Raises:
        OSError: if the ``corpus`` directory or one of its files cannot be
            opened.
    """
    # os.path.join instead of a hard-coded '\\' so the path is also valid
    # on non-Windows platforms.
    corpus_dir = os.path.join(os.getcwd(), 'corpus')

    # The output keeps the platform default encoding because make_pic()
    # reads all_result.txt back without naming an encoding.
    with open('all_result.txt', 'w') as merged:
        for filename in sorted(os.listdir(corpus_dir)):
            filepath = os.path.join(corpus_dir, filename)
            if not os.path.isfile(filepath):
                # Sub-directories cannot be opened as text; skip them.
                continue
            with open(filepath, encoding='utf-8') as source:
                merged.writelines(source)
            # NOTE(review): the published listing lost its indentation; the
            # separator newline is assumed to be written once per file.
            merged.write('\n')

def make_pic():
    """Render a word cloud of ``all_result.txt`` into ``ciyun.png``.

    The merged text is segmented with jieba, tokens of length one and tokens
    listed in ``stopword.txt`` are dropped, the remaining words are counted,
    and the counts are drawn inside the mask image ``mangguo.png`` using the
    font ``msyh.ttf`` and colours sampled from the mask image.

    All file names are relative to the current working directory.
    """
    # Read the merged text with the platform default encoding, which is how
    # join_txt() writes it.
    with codecs.open(u'all_result.txt', 'r') as file:
        content = file.read()

    # Segment with jieba and keep only tokens longer than one character.
    segment = [seg for seg in jieba.cut(content)
               if len(seg) > 1 and seg != '\r\n']

    # Remove stop words (text de-noising).
    words_df = pandas.DataFrame({'segment': segment})
    stopwords = pandas.read_csv("stopword.txt", index_col=False,
                                quoting=3, sep='\t', names=['stopword'],
                                encoding="utf8")
    words_df = words_df[~words_df.segment.isin(stopwords.stopword)]

    # Word-frequency table, most frequent first. groupby().size() replaces
    # the dict-renaming form of SeriesGroupBy.agg, which pandas >= 1.0
    # rejects with SpecificationError.
    words_stat = words_df.groupby('segment').size().reset_index(name='count')
    words_stat = words_stat.sort_values(by="count", ascending=False)

    # Custom word-cloud shape. scipy.misc.imread was removed in SciPy 1.2,
    # so the mask is loaded with matplotlib instead; once the module-level
    # scipy.misc import is dropped this block no longer needs it.
    # matplotlib returns PNG data as floats in [0, 1], whereas the original
    # reader produced 0-255 integers, so rescale when necessary.
    bimg = plt.imread('mangguo.png')
    if bimg.dtype.kind == 'f':
        bimg = (bimg * 255).round().astype(numpy.uint8)

    wordcloud = WordCloud(background_color="white", mask=bimg,
                          font_path='msyh.ttf')

    # fit_words() takes a dict mapping each word to its frequency.
    top_words = words_stat.head(990000)
    frequencies = dict(zip(top_words['segment'], top_words['count']))
    wordcloud = wordcloud.fit_words(frequencies)

    # Colour the words from the mask image.
    bimgColors = ImageColorGenerator(bimg)
    plt.axis("off")
    plt.imshow(wordcloud.recolor(color_func=bimgColors))

    wordcloud.to_file("ciyun.png")

if __name__ == '__main__':
    # Merge the corpus first. join_txt() closes all_result.txt before it
    # returns, so make_pic() can read it immediately; no delay is needed
    # between the two steps.
    join_txt()
    print('txt 文件整合完成！----')

    make_pic()
    print(' 词云图片生成完成-----ciyun.png ')

'''

需要注意：

wordcloud = wordcloud.fit_words(dict(words_stat.head(990000).itertuples(index=False)))

这里 fit_words 接受的参数必须是一个 dict 类型：键是词语，值是该词的出现次数（词频）。

weixin_39846364

关注

0
点赞
踩
1

收藏

觉得还不错? 一键收藏
0
评论
python做词云做文本处理_Python文本处理: 分词和词云图

'''import osimport jieba # 分词包import numpy # numpy计算包import codecs # codecs提供open方法指定打开的文件的语言编码，它会在读取时自动转换为内部的unicodeimport pandas # 统计学工具包import matplotlib.pyplot as pltfrom wordcloud import Word...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。