1.题目要求
完成至少一个数据爬取的案例(例如:爬取政府工作报告,并生成词云)
2.实现
- 爬政府工作报告
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests
# Extract the body text of a web page
def getText(url, timeout=10):
    """Download *url* and return the text of every ``<p>`` element.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may block indefinitely.

    Returns:
        A string holding the text of each ``<p>`` element in document order,
        each followed by a newline (so a non-empty result ends with ``'\\n'``).

    Raises:
        requests.RequestException: On connection failure, timeout, or an
            HTTP error status (4xx/5xx).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of scraping the error page as content.
    response.raise_for_status()
    # Hand the raw bytes (not response.text) to the parser, as before.
    bs = BeautifulSoup(response.content, features="html.parser")
    return ''.join(p.get_text() + '\n' for p in bs.find_all('p'))
# Word-frequency analysis
def getWordFrequency(text, top_n=35):
    """Print and return the most frequent words of *text*.

    The text is segmented with ``jieba.cut(..., cut_all=True)`` and tokens
    shorter than two characters are discarded before counting.

    Args:
        text: The text to analyse.
        top_n: How many of the most frequent words to report (default 35,
            the value that used to be hard-coded).

    Returns:
        A list of ``(word, count)`` tuples, most frequent first. Each pair is
        also printed on its own line as ``word count``.
    """
    from collections import Counter

    words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
    top_words = Counter(words).most_common(top_n)
    for word, freq in top_words:
        print(word, freq)
    return top_words
def removeExcludedWords(words, excluded):
    """Return a new list with every item of *words* that is not in *excluded*.

    The previous code called ``words.remove(word)`` while looping over
    ``words``; removing during iteration makes the loop skip the element that
    follows each removal, so consecutive excluded words were left behind.
    Building a new list avoids that and does not modify *words*.

    Args:
        words: Iterable of tokens to filter.
        excluded: Iterable of tokens to drop.

    Returns:
        A list of the kept tokens in their original order.
    """
    blocked = frozenset(excluded)  # O(1) membership tests
    return [word for word in words if word not in blocked]


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
    text = getText(url)
    # Save the scraped text to a txt file; ``with`` closes the file even if
    # the write raises.
    with open(u"政府工作报告.txt", "w", encoding="utf-8") as f:
        f.write(text)
    getWordFrequency(text)
    # Word-cloud analysis
    words = jieba.lcut(text, cut_all=True)
    # Words to leave out of the cloud
    exclude_words = ["我们", "提高", "国家", "的", "要", "和", "为", "是", "以",
                     "随着", "对于", "对", "等", "能", "都", "中", "在", "了", "通常",
                     "如果", "我", "我国", "他", "就", "着", "什么", "将", "没有",
                     "到", "这", "也", "不", "与", "让", "更", "把"]
    words = removeExcludedWords(words, exclude_words)
    # WordCloud.generate() receives one space-separated string of tokens.
    cuted = ' '.join(words)
    # Render the word cloud and save it as an image
    wc = WordCloud(font_path='msyh.ttc', width=1000, height=700, background_color='white').generate(cuted)
    wc.to_file("wordcloud.png")
(运行结果截图已丢失,此处不再展示。)
- 爬疫情数据
import requests
from pyquery import PyQuery as pq
import json
from xlsxwriter import Workbook
from wordcloud import WordCloud
import matplotlib.pyplot as plt
# Scrape per-province confirmed-case counts, save them to a text file and
# show them as a word cloud sized by case count.
url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
# A timeout keeps the script from waiting forever on an unresponsive server.
response = requests.get(url, timeout=10)  # send the request
if response.status_code == 200:
    # Only influences response.text; the parser below is given the raw bytes.
    response.encoding = "utf-8"
    dom = pq(response.content)
    # Pull the per-province data out of <script id="getAreaStat"> and parse it
    # as JSON. The two splits keep what lies between the first " = " and the
    # following "}catch" -- presumably the script is a JS assignment wrapped
    # in try/catch; verify against the live page if parsing starts failing.
    raw_json = dom("script#getAreaStat").text().split(" = ")[1].split("}catch")[0]
    jsonobj = json.loads(raw_json)
    # Map province name -> confirmed count
    province_data = {item["provinceName"]: item["confirmedCount"] for item in jsonobj}
    # Write the scraped data to a file, one "name:count" line per province.
    # The ``with`` block closes the file, so no explicit close() is needed.
    with open(u'全国各省疫情数据.txt', 'w', encoding="utf-8") as f:
        for k, v in province_data.items():
            f.write(f"{k}:{v}\n")
    # Build the word cloud. The font path is a raw string so the backslashes
    # are not read as (invalid) escape sequences.
    wc = WordCloud(background_color='white', font_path=r'C:\Windows\Fonts\simhei.ttf', width=2000, height=1500)
    wc.generate_from_frequencies(frequencies=province_data)
    # Display the image
    plt.figure()
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.show()
else:
    # Report the failure instead of ending silently with no output.
    print(f"Request to {url} failed with HTTP status {response.status_code}")