Python课设【机械学院】

1.题目要求

        完成至少一个数据爬取的案例(例如:爬取政府工作报告,并生成词云)

2.实现

  • 爬政府工作报告
from wordcloud import WordCloud
from bs4 import BeautifulSoup
import jieba
import requests


# 获取网页中的正文文本
# Fetch a page and extract the text of its paragraphs
def getText(url):
    """Download *url* and return the text of every <p> element, one per line.

    Args:
        url: Page to fetch.

    Returns:
        str: All paragraph texts, each followed by a newline. Empty string
        if the page has no <p> elements.

    Raises:
        requests.RequestException: on network failure, timeout, or an
            HTTP error status.
    """
    # timeout keeps the script from hanging forever on a dead host
    response = requests.get(url, timeout=10)
    # fail loudly instead of silently parsing an error page
    response.raise_for_status()
    bs = BeautifulSoup(response.content, features="html.parser")
    # ''.join is O(n); repeated str += is quadratic
    return ''.join(p.get_text() + '\n' for p in bs.find_all('p'))


# 词频分析
# Word-frequency analysis
def getWordFrequency(text, top_n=35):
    """Print and return the *top_n* most frequent words in *text*.

    Segments *text* with jieba in full mode and discards tokens shorter
    than two characters (single characters are mostly Chinese particles
    and punctuation fragments).

    Args:
        text: Raw Chinese text to analyse.
        top_n: How many of the most frequent words to report (default 35,
            matching the original behaviour).

    Returns:
        list[tuple[str, int]]: (word, count) pairs, most frequent first.
    """
    from collections import Counter
    words = [word for word in jieba.cut(text, cut_all=True) if len(word) >= 2]
    ranking = Counter(words).most_common(top_n)
    for word, freq in ranking:
        print(word, freq)
    # Also return the ranking so callers can reuse it programmatically;
    # existing callers that ignore the return value are unaffected.
    return ranking


if __name__ == '__main__':
    url = 'http://news.sina.com.cn/c/xl/2019-03-05/doc-ihsxncvf9915493.shtml'
    text = getText(url)

    # Save the scraped report text; 'with' guarantees the file is closed
    # even if the write fails.
    with open(u"政府工作报告.txt", "w", encoding="utf-8") as f:
        f.write(text)

    getWordFrequency(text)

    # --- word-cloud generation ---
    words = jieba.lcut(text, cut_all=True)
    # Stopwords to exclude from the cloud (a set makes membership O(1)).
    exclude_words = {"我们", "提高", "国家", "的", "要", "和", "为", "是", "以",
                     "随着", "对于", "对", "等", "能", "都", "中", "在", "了", "通常",
                     "如果", "我", "我国", "他", "就", "着", "什么", "将", "没有",
                     "到", "这", "也", "不", "与", "让", "更", "把"}
    # BUG FIX: the original called words.remove(word) while iterating the
    # same list, which skips the element following every removal and left
    # many stopwords in the output. Build a new filtered list instead.
    filtered = [word for word in words if word not in exclude_words]
    cuted = ' '.join(filtered)

    # Render the cloud; font_path must point to a font with CJK glyphs.
    wc = WordCloud(font_path='msyh.ttc', width=1000, height=700,
                   background_color='white').generate(cuted)
    wc.to_file("wordcloud.png")

        运行结果图没了,懒得放了

  • 爬疫情数据
import requests
from pyquery import PyQuery as pq
import json
from xlsxwriter import Workbook
from wordcloud import WordCloud
import matplotlib.pyplot as plt

url = "https://ncov.dxy.cn/ncovh5/view/pneumonia"
# timeout keeps the script from hanging on an unreachable host
response = requests.get(url, timeout=10)

province_data = {}  # provinceName -> confirmedCount
if response.status_code == 200:
    response.encoding = "utf-8"
    dom = pq(response.content)
    # The page embeds per-province stats as a JS assignment inside
    # <script id="getAreaStat">: "try{window.getAreaStat = [...]}catch...".
    # Slice out the JSON array between " = " and "}catch" and parse it.
    raw = dom("script#getAreaStat").text().split(" = ")[1].split("}catch")[0]
    for item in json.loads(raw):
        province_data[item["provinceName"]] = item["confirmedCount"]
else:
    # BUG FIX: the original fell through and crashed with a NameError on
    # 'jsonobj' whenever the request did not return 200. Fail explicitly.
    raise SystemExit(f"Request failed with HTTP status {response.status_code}")

# Persist the scraped numbers; the with-block closes the file itself
# (the original also called f.close() afterwards, which was redundant).
with open(u'全国各省疫情数据.txt', 'w', encoding="utf-8") as f:
    for province, count in province_data.items():
        f.write(f"{province}:{count}\n")

# Word cloud sized by each province's confirmed-case count.
# BUG FIX: the path is now a raw string — the original contained invalid
# escape sequences ('\W', '\F', '\s'), which is a SyntaxWarning and relies
# on Python keeping invalid escapes literal.
wc = WordCloud(background_color='white',
               font_path=r'C:\Windows\Fonts\simhei.ttf',
               width=2000, height=1500)
wc.generate_from_frequencies(frequencies=province_data)

# Display the cloud.
plt.figure()
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
plt.show()

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值