# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 21:22:42 2020
@author: Administrator
"""
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 22 18:08:04 2020
@author: Administrator
"""
import os
import jieba
from collections import Counter
from wordcloud import WordCloud
import numpy as np
from PIL import Image
import string
#去除标点符号
#打开文件,读取文字内容
class Wordcloud():
    """Build a word-cloud image and a word-frequency Counter from the
    .txt files found in a directory.

    Pipeline: merge all .txt files into one -> read it line by line ->
    segment with jieba -> strip punctuation and stop words -> count
    frequencies and render a WordCloud image shaped by a mask picture.
    """

    def __init__(self,
                 path,
                 back_coloring_path,
                 save_path,
                 width,
                 height,
                 max_words,
                 min_length,
                 stop_words,
                 background_color='white',
                 font_path="simhei.ttf",
                 cut_all=True,
                 ):
        # Directory that holds the source .txt files.
        self.path = path
        # File path where the rendered word-cloud image is saved.
        self.save_path = save_path
        # Mask image loaded as a numpy array; shapes the word cloud.
        self.back_coloring_path = np.array(Image.open(back_coloring_path))
        self.width = width
        self.height = height
        # Optional list of words to drop entirely; None/empty disables it.
        self.stop_words = stop_words
        # jieba mode switch: True = full mode (every possible word),
        # False = accurate mode.
        self.cut_all = cut_all
        self.max_words = max_words
        self.font_path = font_path
        self.background_color = background_color
        # Tokens shorter than this are excluded from the frequency count.
        self.min_length = min_length

    # Remove Chinese and ASCII punctuation from a token.
    def replace_punctutation(self, word):
        # Chinese punctuation marks to strip.
        punctutations = ['【','】','《','》',':',',','(',')','、','!','?','“','”',"。",".","/","%",";"]
        # BUG FIX: str.replace returns a new string; the original code
        # discarded the result, so nothing was ever removed.
        for mark in punctutations:
            word = word.replace(mark, "")
        # BUG FIX: the original loop replaced `i` (the stale variable from
        # the previous loop) instead of `j`, so ASCII punctuation was
        # never removed.
        for mark in string.punctuation:
            word = word.replace(mark, "")
        return word

    # Merge the txt files, read the merged text, and segment it.
    def __open_split_file(self, path):
        file_path = self.join_txt(path)
        file_words = self.open_file(file_path)
        return self.__seg_words(file_words)

    # Read a file into a list: one whitespace-stripped line per entry.
    def open_file(self, path):
        file_words = []
        # BUG FIX: read as utf-8 to match join_txt, which now writes the
        # merged file explicitly in utf-8 (the original read it as gbk
        # while writing with the platform default — a decode-error trap).
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Collapse all internal whitespace out of the line.
                file_words.append(''.join(line.split()))
        return file_words

    # Segment every line with jieba and clean the resulting tokens.
    def __seg_words(self, file_words):
        seg_lists = []
        for line in file_words:
            seg_lists.extend(jieba.cut(line, cut_all=self.cut_all))
        results = []
        for token in seg_lists:
            token = self.replace_punctutation(token)  # strip punctuation
            if self.stop_words:
                token = self.delet_words(token)  # blank out stop words
            # BUG FIX: skip every empty token; the original appended them
            # all and then removed only ONE '' with results.remove('').
            if token:
                results.append(token)
        return results

    # Return "" if the token contains any stop word, else the token.
    def delet_words(self, word):
        for stop in self.stop_words:
            if stop in word:
                word = ""
        return word

    # Count token frequencies, ignoring tokens shorter than min_length.
    def __count_words(self, results):
        counter = Counter()
        for word in results:
            if len(word) >= self.min_length and word != '\n':
                counter[word] += 1
        return counter

    def word_cloud_and_count_words(self):
        """Run the full pipeline.

        Returns:
            (counter, image): a Counter of word frequencies and the PIL
            image of the rendered cloud (also saved to self.save_path).
        """
        words = self.__open_split_file(self.path)
        counter = self.__count_words(words)
        text = ' '.join(words)
        wordcloud = WordCloud(
            background_color=self.background_color,
            width=self.width,
            height=self.height,
            margin=2,
            max_words=self.max_words,
            mask=self.back_coloring_path,
            font_path=self.font_path,
            random_state=100)  # fixed seed -> reproducible colour layout
        word_cloud = wordcloud.generate(text)
        image = word_cloud.to_image()
        image.save(self.save_path)
        return counter, image

    def join_txt(self, path):
        """Concatenate every .txt file under `path` into results.txt.

        Returns the path of the merged file. Each source file's content
        is separated by a newline.
        """
        file_names = os.listdir(path)
        result_path = os.path.join(path, 'results.txt')
        # Explicit utf-8 so open_file can read it back portably
        # (the original relied on the platform default encoding).
        with open(result_path, 'w', encoding='utf-8') as out:
            for file_name in file_names:
                # BUG FIX: also skip a stale results.txt from a previous
                # run, which would otherwise be merged into itself.
                if ".txt" not in file_name or file_name == 'results.txt':
                    continue
                file_path = os.path.join(path, file_name)
                # Context managers close the files even on error.
                with open(file_path, encoding='utf-8') as src:
                    for line in src:
                        out.write(line)
                out.write('\n')  # separate each source file's content
        return result_path
if __name__ == "__main__":
    # Build the word cloud with explicit keyword arguments.
    cloud = Wordcloud(
        path="D:\\cloud_words\\",                         # folder holding the source .txt files
        back_coloring_path="D:\\cloud_words\\爱心.png",   # mask / background image
        save_path="D:\\cloud_words\\词云图.png",          # where the rendered cloud is saved
        background_color="white",                         # canvas colour
        font_path="simhei.ttf",                           # font (SimHei, supports Chinese)
        cut_all=False,                                    # accurate jieba mode
        width=300,                                        # image width in pixels
        height=400,                                       # image height in pixels
        max_words=100,                                    # cap on words shown
        min_length=2,                                     # shortest word counted
        stop_words=None,                                  # list of words to drop, or None
    )
    # Run the pipeline, print the frequency table, and display the image.
    frequencies, cloud_image = cloud.word_cloud_and_count_words()
    print(frequencies)
    cloud_image.show()
文本内容
假如你不够快乐
作者:汪国真
也不要把眉头深锁
人生本来短暂
为什么 还要栽培苦涩
打开尘封的门窗
让阳光雨露洒遍每个角落
走向生命的原野
让风儿熨平前额
博大可以稀释忧愁
深色能够覆盖浅色
背景图
最终效果如下
Counter({'假如': 1, '不够': 1, '快乐': 1, '作者': 1, '汪国真': 1, '不要': 1, '眉头': 1, '深锁': 1, '人生': 1, '本来': 1, '短暂': 1, '为什么': 1, '还要': 1, '栽培': 1, '苦涩': 1, '打开': 1, '尘封': 1, '门窗': 1, '阳光雨露': 1, '洒遍': 1, '每个': 1, '角落': 1, '走向': 1, '生命': 1, '原野': 1, '风儿': 1, '熨平': 1, '前额': 1, '博大': 1, '可以': 1, '稀释': 1, '忧愁': 1, '深色': 1, '能够': 1, '覆盖': 1, '浅色': 1})