文本信息获取与词频统计
内容:
基于正则表达式抓取指定网页中的部分文字信息,使用 jieba 分词后进行词频统计,并绘制词云图。
源代码:
import re
from urllib.request import urlopen
import jieba
import wordcloud
import matplotlib.pyplot as plt
# Script: download a page, extract (time, event) text fragments with a
# regular expression, save them to a text file, count word frequencies
# with jieba, and render a word cloud.

startUrl = r'http://ccs.snnu.edu.cn/xygk/lsyg1.htm'

# Download the page; decode() without arguments decodes the bytes as UTF-8.
with urlopen(startUrl) as fp:
    content = fp.read().decode()
print(content)

# Every match yields three groups: the text of the first highlighted
# <span>, the text following ':' in the second highlighted <span>, and the
# remaining text up to the closing </span></p>.
pattern = re.compile(
    '<p.*?<span style="background.*?>(.*?)</span>.*?'
    '<span style="background.*?>:(.*?)</span>(.*?)</span></p>', re.I)
result = pattern.findall(content)
print(result)

# newline='' disables newline translation so the explicit '\r\n' below is
# written verbatim; with the default text mode Windows would turn it into
# '\r\r\n'. The with-block also closes the file if a write fails.
with open('test_example', 'w', encoding='utf-8', newline='') as file_test:
    for item in result:
        print(item[0], item[1], item[2])
        file_test.write('时间:' + item[0] + '\r\n')
        file_test.write('事件:' + item[1] + item[2] + '\r\n')

# All captured fragments joined into one string for segmentation.
result_str = ''.join(''.join(item) for item in result)

# Segment the text into words; the word cloud expects space-separated text.
words = jieba.lcut(result_str)
words_str = ' '.join(words)

# Word -> occurrence count, in first-seen order.
stat_dict = {}
for element in words:
    stat_dict[element] = stat_dict.get(element, 0) + 1
print(stat_dict)

# The font path points at a Windows system font; a font with CJK glyphs is
# needed for the Chinese words to render.
# NOTE(review): random_state is passed as False; presumably intended as a
# fixed seed for reproducible layout - confirm.
wc = wordcloud.WordCloud(
    r'C:\\windows\\fonts\\simfang.ttf', width=500, height=400,
    background_color='white', font_step=3,
    random_state=False, prefer_horizontal=0.9)
craw_stat = wc.generate(words_str)
craw_stat.to_image().save('craw_stat.png')

# Show the generated cloud without axes.
plt.imshow(wc)
plt.axis('off')
plt.show()
运行结果:![](https://i-blog.csdnimg.cn/blog_migrate/2b4e29aa3ed4cd6eb93bf2406f37f50e.png)