import requests
from bs4 import BeautifulSoup
import pandas
import sqlite3
import jieba
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Fetch the Xinhua front page and collect every headline linked inside an <li> element.
url = "http://www.xinhuanet.com/"
f = open("css.txt", "w", encoding="utf-8")
res0 = requests.get(url)
res0.encoding = "utf-8"
soup = BeautifulSoup(res0.text, "html.parser")
newsgroup = []
for news in soup.select("li"):
    if len(news.select("a")) > 0:
        title = news.select("a")[0].text
        print(title)
        newsgroup.append(title)
        f.write(title + "\n")  # one headline per line
f.close()
# Read the saved headlines back in as a single string.
f0 = open('css.txt', 'r', encoding='utf-8')
qz = f0.read()
f0.close()
print(qz)
words=list(jieba.cut(qz))
ul={':','的','"','、','”','“','。','!',':','?',' ','\u3000',',','\n'}
dic={}
# Count how often each remaining word appears.
keys = set(words) - ul
for i in keys:
    dic[i] = words.count(i)
c=list(dic.items())
c.sort(key=lambda x:x[1],reverse=True)
# Write the ten most frequent words, each repeated once per occurrence,
# so the word cloud later reflects relative frequency.
f1 = open('diectory.txt', 'w', encoding='utf-8')
for i in range(10):
    print(c[i])
    for words_count in range(c[i][1]):
        f1.write(c[i][0] + ' ')
f1.close()
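# The word-frequency count above can also be done with the standard library's
# collections.Counter; this commented-out sketch (an equivalent alternative, not part
# of the original flow) yields the same (word, count) pairs already sorted by count:
# from collections import Counter
# c = Counter(w for w in words if w not in ul).most_common()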
df = pandas.DataFrame(words)
print(df.head())
# Store the segmented words in a SQLite database via pandas.
with sqlite3.connect('newsdb3.sqlite') as db:
    df.to_sql('newsdb3', con=db)
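# Note: to_sql defaults to if_exists='fail', so rerunning the script raises an error
# once the table already exists; passing if_exists='replace' (or 'append') avoids that:
# df.to_sql('newsdb3', con=db, if_exists='replace')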
f3 = open('diectory.txt', 'r', encoding='utf-8')
cy_file=f3.read()
f3.close()
cy=WordCloud().generate(cy_file)
plt.imshow(cy)
plt.axis("off")
plt.show()
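# Note: WordCloud's bundled default font has no CJK glyphs, so the Chinese words may
# render as empty boxes. If that happens, point font_path at any Chinese font on your
# system (the path below is only an example/assumption):
# cy = WordCloud(font_path='C:/Windows/Fonts/simhei.ttf').generate(cy_file)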