# -*- coding: utf-8 -*-
"""
Spyder Editor
This is a temporary script file.
"""
#
#1.通过网络数据采集获取政府工作报告(2014-2021年度报告),
#并对政府工作报告历年的数据进行存储、数据整理、中文分词、统计词语关联,并可视化,
#同时给出政府工作报告重心随时间的转移情况统计。
#导入必要的包
from collections import Counter

import requests
from lxml import etree
import jieba
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
#生成政府工作报告地址url
def genURLS(startyear, endyear):
    """Build the report URLs for every year in [startyear, endyear)."""
    return [
        "http://www.gov.cn/guowuyuan/{}zfgzbg.htm".format(year)
        for year in range(startyear, endyear)
    ]
#获取政府工作报告
def getTxtByUrl(url, session, timeout=10):
    """Fetch one report page and return its paragraph text, or None on failure.

    url     -- report page URL (see genURLS).
    session -- a requests.Session used for the GET.
    timeout -- seconds before the request is aborted (new, defaulted, so
               existing callers are unaffected).

    Returns the concatenated text of the report paragraphs, or None when the
    request fails or the server does not answer 200.
    """
    try:
        # A timeout and exception guard keep one bad year from killing the
        # whole multi-year download loop.
        resp = session.get(url, timeout=timeout)
    except requests.RequestException:
        return None
    if resp.status_code != 200:
        return None
    resp.encoding = 'utf-8'  # pages are UTF-8; requests may mis-guess
    dom = etree.HTML(resp.text)
    txtList = dom.xpath("//div[@id='conlun2_box_text']/p/text()")
    return "".join(txtList)
#将政府工作报告文本存储到本地文件
def getStopWord(filePath, words):
    """Save the report text to filePath and return it as one string.

    NOTE(review): despite the name, this writes the *report* text, not the
    stopword list; the return value is printed by the caller.

    filePath -- destination file path.
    words    -- report text (a str; a sequence of str also joins correctly).
    """
    # Explicit UTF-8: the default locale encoding (e.g. gbk on Windows) can
    # fail on some characters in the report text.
    with open(filePath, 'w', encoding='utf-8') as file:
        file.write(words)
    return "".join(words)
#读取停用词表(每行一个词)
def getStopWords(filePath):
    """Load the stopword list from filePath, one word per line.

    Returns a list of words with line terminators removed.

    Fixes: the original sliced `word[:-1]`, which chops the last *character*
    of the final word when the file has no trailing newline; rstrip('\\n')
    only removes the newline. Also opens with explicit UTF-8.
    """
    with open(filePath, encoding='utf-8') as file:
        return [line.rstrip('\n') for line in file]
#对工作报告进行分词和统计
#中文分词
def cutwords(txt):
    """Segment Chinese text into a list of tokens using jieba.

    Fixes: the original printed every single token (debug residue), which
    spammed the console with thousands of lines per report and dominated
    runtime; the manual append loop is replaced by list().
    """
    return list(jieba.cut(txt))
#从分词结果中删除停用词语
def dropWords(wordLists, dropwordList):
    """Return the words of wordLists not present in dropwordList, in order.

    Fixes: membership was tested against a *list* for every word (O(n*m));
    building a set once makes each test O(1) with identical results.
    """
    dropset = set(dropwordList)
    return [word for word in wordLists if word not in dropset]
#对工作报告进行统计
def wordCount(wordlist):
    """Return a frequency dict {word: occurrence count} for wordlist.

    Uses collections.Counter (C-accelerated) instead of the hand-rolled
    dict.get loop; converted back to a plain dict so the return type and
    KeyError-on-missing behavior are unchanged for callers.
    """
    return dict(Counter(wordlist))
#打印出前top10词
def getTopNword(mdict, n):
    """Return a dict of the n entries of mdict with the highest counts."""
    ranked = sorted(mdict.items(), key=lambda kv: kv[1], reverse=True)
    return dict(ranked[:n])
def plotWordCount(mdict, mwith=600, mheight=500):
    """Render a word-frequency dict as a word cloud and display it.

    mdict   -- {word: count} frequencies.
    mwith   -- cloud width in pixels (parameter name kept for callers).
    mheight -- cloud height in pixels.

    NOTE(review): the font path is Windows-specific — confirm before running
    elsewhere.
    """
    cloud_model = WordCloud(
        font_path=r'C:\Windows\Fonts\simfang.ttf',
        width=mwith,
        height=mheight,
        scale=1,
        margin=2,
        background_color='white',
        max_words=200,
        min_font_size=40,
        max_font_size=140,
        stopwords=STOPWORDS,
    )
    plt.imshow(cloud_model.generate_from_frequencies(mdict))
    plt.show()
#统计每年政府工作报告的词频
def countWords(txt, excutions=None):
    """Count token frequencies in txt via jieba, minus excluded words.

    txt       -- raw report text.
    excutions -- iterable of words to drop from the result (default: none;
                 the name's spelling is kept for caller compatibility).

    Fixes: `excutions=[]` was a mutable default argument shared across
    calls; `del mdict[it]` raised KeyError whenever an excluded word did
    not occur in the text — pop(..., None) tolerates that.
    """
    counts = {}
    for token in jieba.cut(txt):
        counts[token] = counts.get(token, 0) + 1
    for word in (excutions or []):
        counts.pop(word, None)
    return counts
# Download, store, segment and summarize each year's government work report.
txt = {}
tList = {}
result = {}
wordsdict = {}
topwords = {}
urls = genURLS(2014, 2022)
session = requests.session()
filepath = "D:/stopwords.txt"
# Hoisted out of the loop: the stopword list is loop-invariant, so read the
# file once instead of once per year (the original also ignored `filepath`
# and repeated the literal path).
dropwordList = getStopWords(filepath)
for i in range(len(urls)):
    txt[i] = getTxtByUrl(urls[i], session)
    print("---------------------")
    # URL ends in ".../<year>zfgzbg.htm": chars [-14:-10] are the 4-digit year
    year = urls[i][-14:-10]
    dowload = f'D:/{year}政府工作报告.txt'
    tList[i] = cutwords(txt[i])                    # Chinese word segmentation
    result[i] = dropWords(tList[i], dropwordList)  # remove stopwords
    wordsdict[i] = wordCount(result[i])
    topwords[i] = getTopNword(wordsdict[i], 10)    # top-10 words of the year
    # Save the report text to disk and echo it to the console.
    print(getStopWord(dowload, txt[i]))
for i in range(len(urls)):
    plotWordCount(topwords[i])
# Set operations: track how the top-10 vocabulary shifts between years.
# (Unused placeholders t = {} / m = {} from the original were removed.)
d_colloct = {}
mdiff = {}
for i in range(len(topwords)):
    d_colloct[i] = set(topwords[i].keys())
# Difference: words that are new in year i+1 relative to year i.
for i in range(len(topwords) - 1):
    mdiff[i] = d_colloct[i + 1].difference(d_colloct[i])
    print(mdiff[i])
# Intersection: words shared by two consecutive years.
intersetion = {}
for i in range(len(d_colloct) - 1):
    intersetion[i] = d_colloct[i].intersection(d_colloct[i + 1])
# Bar chart: how often '创新' (innovation) is mentioned in each year's report.
# (Unused variable fenciRest1 from the original was removed.)
fenci1 = {}
fenciRest = {}
gaigecount = []
for i in range(len(txt)):
    fenci1[i] = cutwords(txt[i])          # re-segment the raw report text
    fenciRest[i] = wordCount(fenci1[i])
    # .get with default 0 avoids a KeyError if a year never mentions the word.
    gaigecount.append(fenciRest[i].get('创新', 0))
plt.rcParams['font.sans-serif'] = ['SimHei']   # render CJK labels correctly
x_data = ['2014', '2015', '2016', '2017', '2018', '2019', '2020', '2021']
y_data = gaigecount
plt.bar(x=x_data, height=y_data, label='创新', alpha=0.8)
plt.title("'创新'在每年提及次数")
plt.xlabel("年份")
plt.ylabel("次数")
plt.show()