import pandas as pd
# Load the tab-separated news corpus. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
news_path = r'C:\Users\CDAer\Desktop\data\car.txt'
news_columns = ['category', 'theme', 'url', 'content']
df_news = pd.read_table(news_path, names=news_columns)
# Notebook-style inspection: first rows, (rows, columns), rows per category.
df_news.head(3)
df_news.shape
df_news['category'].value_counts()
财经 500
教育 500
健康 500
军事 500
娱乐 500
科技 500
时尚 500
汽车 500
体育 500
文化 500
Name: category, dtype: int64
# Word-cloud analysis of the 500 entertainment ('娱乐') news articles.
is_entertainment = df_news['category'] == '娱乐'
df_relax = df_news[is_entertainment]
# Plain Python list holding the article body of every selected row.
list_relax = df_relax['content'].values.tolist()
jieba分词
import jieba

# Load the stop-word list: one entry per line. quoting=3 is csv.QUOTE_NONE,
# so quote characters inside the file are kept as literal text.
stopwords = pd.read_csv(r'C:\Users\CDAer\Desktop\data\stopwords.txt', sep='\t',
                        quoting=3, names=['stopword'])
stopwords_list = stopwords['stopword'].values.tolist()
# Membership is tested once per token, so use a set (O(1) lookup) instead of
# scanning the whole list every time.
stopword_set = set(stopwords_list)

# Tokenise every article with jieba and keep only the meaningful tokens.
words = []
for line in list_relax:
    seg = jieba.lcut(line)
    for word in seg:
        # Drop tokens of length <= 1 (this also covers a bare '\n').
        if len(word) <= 1:
            continue
        # Drop stop words.
        if word in stopword_set:
            continue
        words.append(word)
计算词频
# One row per kept token; repeated words stay as separate rows so that they
# can be counted afterwards.
df_cloud = pd.DataFrame(data={'cloud_words': words})
df_cloud.head()
| | cloud_words |
|---|---|
| 0 | 网友 |
| 1 | 出图 |
| 2 | 显示 |
| 3 | 陈宝国 |
| 4 | 妻子 |
# Occurrences of each distinct word, as a Series indexed by the word.
count = df_cloud.groupby(by=['cloud_words'])['cloud_words'].count()
import numpy as np

# Word frequencies as a one-column DataFrame ('count') indexed by the word.
# Passing a dict to SeriesGroupBy.agg to rename the result was deprecated and
# is rejected by pandas >= 1.0, so take the group sizes and name the
# resulting column explicitly instead.
words_count = (
    df_cloud.groupby(by=['cloud_words'])['cloud_words']
    .size()
    .to_frame('count')
)
D:\ProgramData\Anaconda3\lib\site-packages\ipykernel_launcher.py:2: FutureWarning: using a dict on a Series for aggregation
is deprecated and will be removed in a future version
# Move the word out of the index into an ordinary column, then order the rows
# by frequency with the most frequent word first.
words_count_flat = words_count.reset_index()
words_count_sort = words_count_flat.sort_values(by=['count'], ascending=False)
words_count_sort.head()
| | cloud_words | count |
|---|---|---|
| 13659 | 电影 | 443 |
| 16725 | 观众 | 342 |
| 6980 | 导演 | 294 |
| 16074 | 节目 | 259 |
| 990 | 中国 | 257 |
绘制词云
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# IPython/Jupyter magic (not plain Python): draw figures inline in the
# notebook output.
%matplotlib inline
# IPython help lookup for WordCloud (notebook-only syntax).
WordCloud?
# Basic word-cloud configuration: only the font file is specified.
# NOTE(review): presumably SimHei is chosen so Chinese glyphs render — confirm.
cloud = WordCloud(font_path='C:/Windows/Fonts/simhei.ttf')
# Build the word -> frequency mapping from the sorted (word, count) rows and
# generate the cloud from it.
word_freq = {word: freq for word, freq in words_count_sort.values}
pic_cloud = cloud.fit_words(word_freq)
# Draw the cloud without axes.
plt.imshow(pic_cloud)
plt.axis('off')
plt.show()
词云参数的调整
# IPython help lookup for plt.imshow (notebook-only syntax).
plt.imshow?
# NOTE(review): the names below look like the `interpolation` options copied
# from that help text — confirm against the installed matplotlib version.
# 'none', 'nearest', 'bilinear', 'bicubic',
# 'spline16', 'spline36', 'hanning', 'hamming', 'hermite', 'kaiser',
# 'quadric', 'catrom', 'gaussian', 'bessel', 'mitchell', 'sinc',
# 'lanczos'
# Default figure size for everything drawn from here on.
plt.rcParams['figure.figsize'] = (10, 5)
# Word-cloud settings: font file, canvas dimensions and a white background.
cloud_options = {
    'font_path': 'C:/Windows/Fonts/simhei.ttf',
    'width': 1000,
    'height': 500,
    'background_color': 'white',
}
cloud = WordCloud(**cloud_options)
# Build the word -> frequency mapping from the sorted (word, count) rows and
# generate the cloud from it.
word_freq = {word: freq for word, freq in words_count_sort.values}
pic_cloud = cloud.fit_words(word_freq)
# Draw the cloud with bilinear interpolation and no axes.
plt.imshow(pic_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
![output_22_0.png](output_22_0.png)
任意形状的词云
from PIL import Image
pic=np.array(Image.open('C:\\Users\\CDAer\\Desktop\\data\\people.jpg'))
WordCloud?
# Default figure size for everything drawn from here on.
plt.rcParams['figure.figsize'] = (10, 5)
# Word-cloud settings: font file, canvas dimensions, white background and the
# image array that gives the cloud its shape.
# NOTE(review): the wordcloud docs say width/height are ignored when a mask is
# given (the mask's shape is used) — confirm for the installed version.
cloud_options = {
    'font_path': 'C:/Windows/Fonts/simhei.ttf',
    'width': 1000,
    'height': 500,
    'background_color': 'white',
    'mask': pic,
}
cloud = WordCloud(**cloud_options)
# Build the word -> frequency mapping from the sorted (word, count) rows and
# generate the cloud from it.
word_freq = {word: freq for word, freq in words_count_sort.values}
pic_cloud = cloud.fit_words(word_freq)
# Draw the cloud with bilinear interpolation and no axes.
plt.imshow(pic_cloud, interpolation='bilinear')
plt.axis('off')
plt.show()
![output_26_0.png](output_26_0.png)