# 政府工作报告 (Government Work Report word-frequency analysis)

import jieba
import requests
from lxml import etree
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

"""
1.通过网络数据采集获取政府工作报告(2014-2021年度报告),
并对政府工作报告历年的数据进行存储、数据整理、中文分词、统计词语关联,
并可视化,同时给出政府工作报告重心随时间的转移情况统计。
"""
#生成政府工作报告地址url
def SNN_getURL(Sstart, Send):
    """Build the report URLs for years in the half-open range [Sstart, Send).

    Sstart -- first year to include (e.g. 2014)
    Send   -- one past the last year (e.g. 2022 for reports through 2021)
    Returns a list of URL strings, one per year, in ascending order.
    """
    SS = []
    for i in range(Sstart, Send):
        # Each yearly report lives at .../guowuyuan/<year>zfgzbg.htm
        URL = "http://www.gov.cn/guowuyuan/{}zfgzbg.htm".format(i)
        SS.append(URL)
    return SS

##获取政府工作报告
def SNN_getText(url, session):
    """Download one report page and return its body text, or None on HTTP error.

    url     -- page address produced by SNN_getURL
    session -- a requests.Session used for the GET
    Returns the concatenated text of all <p> nodes inside the
    div#conlun2_box_text container, or None when the status is not 200.
    """
    DZ = session.get(url)
    if DZ.status_code != 200:
        return None
    # The site serves UTF-8; set it explicitly so .text decodes correctly.
    DZ.encoding = 'utf-8'
    T = etree.HTML(DZ.text)
    Text = T.xpath("//div[@id='conlun2_box_text']/p/text()")
    Texts = "".join(Text)
    return Texts

# 读取停用词文本 (read the stopword file)

def SNN_get_stopwords(SNNfile):
    """Read a stopword file (one word per line) and return the stripped words.

    SNNfile -- path to a UTF-8 text file of Chinese stopwords
    Returns a list of words with surrounding whitespace/newlines removed.
    """
    # Explicit encoding: the stopword file is Chinese text and must not
    # depend on the platform-default codec.
    with open(SNNfile, encoding='utf-8') as file:
        return [line.strip() for line in file]

#对工作报告进行分词
def SNN_separatewords(txt):
    """Segment the report text into a list of Chinese tokens via jieba."""
    return jieba.lcut(txt)

# 过滤停用词 (filter out stopwords)

def SNN_Removewords(nn, stopnn):
    """Return the tokens of nn that are not stopwords, preserving order.

    nn     -- iterable of tokens (jieba output)
    stopnn -- iterable of stopwords to drop
    """
    # Build the stopword set once: O(1) membership tests instead of an
    # O(len(stopnn)) list scan per token.
    stopset = set(stopnn)
    return [tok for tok in nn if tok not in stopset]

#对工作报告进行统计
def SNN_Wordsum(onebyone):
    """Tally word frequencies; returns a {word: count} dict in first-seen order."""
    counts = {}
    for word in onebyone:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

#对工作报告进行字典排序 取前n个
def SNN_Wordsort_descending(mdict, snn):
    """Return a dict of the snn highest-count words of mdict, count-descending.

    Ties keep their original insertion order (sorted() is stable).
    """
    ranked = sorted(mdict, key=mdict.get, reverse=True)[:snn]
    return {word: mdict[word] for word in ranked}

#绘制词云
def SNN_Plot(mdict, mwith=600, mheight=500):
    """Render a word cloud from a {word: frequency} dict and display it.

    mdict   -- word -> frequency mapping (e.g. a SNN_Wordsort_descending result)
    mwith   -- canvas width in pixels (original parameter name kept for
               backward compatibility, despite the typo)
    mheight -- canvas height in pixels
    Side effect: opens a matplotlib window; returns None.
    """
    model = WordCloud(font_path=r'C:\Windows\Fonts\msyh.ttf',  # CJK font needed for Chinese glyphs
                      width=mwith,
                      height=mheight,
                      scale=1,
                      margin=2,
                      background_color='pink',
                      max_words=200,
                      min_font_size=40,
                      max_font_size=140,
                      stopwords=STOPWORDS)
    cloud = model.generate_from_frequencies(mdict)
    plt.imshow(cloud)
    plt.show()

# ---- Driver: fetch each year's report (2014-2021), clean it, and show the
# top-15 words as a word cloud ----
separateList = SNN_get_stopwords("stopwords.txt")
urls = SNN_getURL(2014, 2022)
session = requests.session()

for s in range(0, 8):
    sText = SNN_getText(urls[s], session)
    sSeparatewords = SNN_separatewords(sText)
    sText_result = SNN_Removewords(sSeparatewords, separateList)
    sWordsum_result = SNN_Wordsum(sText_result)
    sText_top15 = SNN_Wordsort_descending(sWordsum_result, 15)
    print("*" * 100)
    print(sText_top15)
    SNN_Plot(sText_top15)

# ---- Year-over-year comparison of the top-15 keyword sets ----
# The original code repeated the fetch/clean/count pipeline eight times and
# the set comparison seven times; both are collapsed into loops here.
year_sets = []
for idx in range(8):
    sText = SNN_getText(urls[idx], session)
    tokens = SNN_Removewords(SNN_separatewords(sText), separateList)
    top15 = SNN_Wordsort_descending(SNN_Wordsum(tokens), 15)
    year_sets.append(set(top15.keys()))

# Compare each year's keyword set with the previous year's:
# "差集" = words new this year, "交集" = words shared with last year.
for i in range(1, 8):
    newer, older = year_sets[i], year_sets[i - 1]
    print("\n")
    print("------------------{}-{}作比较----------------------".format(2013 + i, 2014 + i))
    print("差集", newer.difference(older))
    print("交集", newer.intersection(older))

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值