要求
搜索并下载2021、2022、2023年政府工作报告全文,分别保存为txt文件,读取文件内容并使用wordcloud库绘制词云,分析每年关键词变化。
思路
1.使用爬虫爬取对应网站所需内容,保存在对应年份的txt文件中。
2.读取对应年份的txt文件,进行文本分词。
3.生成词云图并保存。
代码
所用库
import jieba
from wordcloud import wordcloud
import matplotlib.pyplot as plt
import requests,re
from lxml import etree
下载保存
def get_content(year, timeout=10):
    """Download the government work report for ``year`` and save it as text.

    The report page is fetched from www.gov.cn, the text of every ``<span>``
    and ``<p>`` under the report container is extracted, and the result is
    written to ``<year>.txt`` (UTF-8) in the current working directory.

    Args:
        year: Four-digit year as a string, e.g. ``'2021'``. It is spliced
            into both the URL and the output file name.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-2xx HTTP status.
    """
    # Only a browser-like User-Agent is sent. No proxy is configured, and no
    # cookie is attached: a session cookie from another site must never be
    # forwarded to this host.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
    }
    url = 'https://www.gov.cn/zhuanti/' + year + 'lhzfgzbg/index.htm'

    response = requests.get(url=url, headers=headers, timeout=timeout)
    # Fail loudly instead of saving an error page as the report text.
    response.raise_for_status()
    html = response.content.decode()

    tree = etree.HTML(html)
    container = '//ul[contains(@class,"addScroll zhj-bgqw")]'
    span_texts = tree.xpath(container + '//span/text()')
    paragraph_texts = tree.xpath(container + '//p/text()')

    # Span texts are concatenated as-is; each paragraph text is terminated
    # with a newline.
    data = ''.join(span_texts) + ''.join(text + '\n' for text in paragraph_texts)

    with open(year + '.txt', 'w', encoding='utf-8') as f:
        f.write(data)
    print(year, "年政府工作报告获取完毕")
分词,生成词云图
def word_picture(year, font_path=r'D:\Lenvo\Desktop\simhei.ttf'):
    """Build, display and save a word cloud for the report of ``year``.

    Calls ``get_content(year)`` to (re)download the report into
    ``<year>.txt``, segments the text with jieba, renders a word cloud,
    shows it with matplotlib and finally writes it to ``<year>.jpg``.

    Args:
        year: Four-digit year as a string, e.g. ``'2021'``.
        font_path: Path to a TrueType font passed to ``WordCloud``. The
            default is the machine-specific SimHei location from the
            original script; pass another path on other machines.
    """
    get_content(year)
    with open(year + '.txt', encoding='utf-8') as f:
        text = f.read()

    # Segment the text; jieba.cut returns an iterator of tokens.
    words = jieba.cut(text)
    # Tokens must be joined with spaces: joining with '' would just rebuild
    # the original unsegmented text and discard the segmentation.
    segmented_text = ' '.join(words)

    # Render the word cloud from the space-separated tokens.
    cloud = wordcloud.WordCloud(font_path=font_path).generate(segmented_text)
    plt.imshow(cloud)
    plt.axis('off')  # hide axes and tick labels
    plt.show()

    # Save the rendered word cloud as an image.
    cloud.to_file(year + '.jpg')
# Generate the word cloud for each report year, in chronological order.
for report_year in ('2021', '2022', '2023'):
    word_picture(report_year)