Installation:
pip install wordcloud (on Windows this can fail with a "Microsoft Visual C++ 14.0 is required" build error)
Workaround:
Download the wordcloud source from GitHub (https://github.com/amueller/word_cloud/), unzip it, and run python setup.py install in the extracted directory.
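The same steps as commands (cloning the repository is equivalent to downloading and unzipping the archive):
git clone https://github.com/amueller/word_cloud/
cd word_cloud
python setup.py install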
Usage:
import pandas as pd
import jieba
from langconv import *  # traditional/simplified Chinese conversion; download langconv.py and zh_wiki.py and put them in the same directory as this script
from tongyici import *  # custom synonym dictionary; tongyici.py must be in the same directory as this script
import re
from collections import Counter  # the collections module provides useful container classes; Counter is a simple counter
# the imports above are for segmentation, the ones below for word-cloud visualization
import matplotlib.pyplot as plt
from wordcloud import WordCloud
import imageio
# Segmentation
# Convert traditional Chinese characters to simplified
def fan_to_jian(line):
    line = Converter('zh-hans').convert(line)
    return line
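# Quick sanity check (illustrative input; the exact mapping comes from the
# conversion tables in zh_wiki.py):
# print(fan_to_jian('繁體轉簡體'))  # expected: '繁体转简体'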
# Add custom words to jieba's dictionary
def add_word(words):
    for w in words:
        jieba.add_word(w)

# Remove words from jieba's dictionary
def del_word(words):
    for w in words:
        jieba.del_word(w)
# Analyze the product names with jieba
add_ci = ['3d max', 'altium designer', 'mac os', 'spring boot', 'android studio', 'deep learning ai', 'machine learning', 'visual c++', 'visual studio', 'cloud docker', 'atey ghalian', 'john park']
del_ci = ['视频教程', '自学教材']
add_word(add_ci)
del_word(del_ci)  # alternatively, jieba.add_word('视频教程', 0) tweaks the dictionary on the fly so the phrase splits into the two words '视频' and '教程'
jieba.re_han_default = re.compile('(.+)', re.U)  # keep terms containing spaces or other special characters from being split into several tokens
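# With the patched regex plus the add_word() calls above, multi-word terms
# such as 'mac os' or 'visual studio' can come back as single tokens.
# A hypothetical check:
# print(jieba.lcut('reinstall mac os and visual studio'))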
jieba.load_userdict('c:/ProgramData/Anaconda3/Lib/site-packages/jieba/mydict.txt')  # load the custom user dictionary
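# jieba's user-dictionary file holds one entry per line in the form
# "word [freq] [POS tag]", with freq and tag optional; illustrative entries:
#   spring boot 3 n
#   机器学习 5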
file = pd.read_excel(r'c:\users\administrator\desktop\商品信息v2.0版.xlsx')  # read the local Excel file
title = list(file['商品名称'])  # the product-name column as a list
title_s = []  # segmented product names
for line in title:
    text = fan_to_jian(line).lower()  # convert traditional characters to simplified and uppercase letters to lowercase
    title_cut = jieba.lcut(text)  # lcut() returns the segmentation as a list
    # Replace synonyms with their canonical word
    new_cut = []  # tokens after synonym replacement
    for fenci in title_cut:
        if fenci in dian:  # dian is the synonym dict from tongyici.py
            val = dian[fenci]  # the canonical word for this synonym
            new_cut.append(val)
        else:
            new_cut.append(fenci)
    title_s.append(new_cut)
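# tongyici.py is assumed to define a plain dict named dian that maps each
# synonym to its canonical form, for example (illustrative entries only):
#   dian = {'python3': 'python', 'ps': 'photoshop', '3dmax': '3d max'}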
# print(title_s)
# Load stop words
stopwords = ['教程']  # stop-word list, seeded with words the stop-word file lacks
for line in open(r'e:/Python/mypy/stopwords/中英文.txt', 'r', encoding='utf-8'):
    stopwords.append(line.strip())  # collect the stop words from the file
# Remove stop words from the segmented product names
title_clear = []  # segmented names with stop words removed
for line in title_s:
    line_clear = []  # one product's tokens after stop-word removal
    for word in line:
        if word not in stopwords and word.strip():  # keep tokens that are neither stop words nor empty
            line_clear.append(word)
    title_clear.append(line_clear)
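# The Counter imported above can then tally word frequencies across all
# product names; a minimal sketch under that assumption:
# allwords = [w for line in title_clear for w in line]
# word_count = Counter(allwords)
# print(word_count.most_common(20))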