从丁香园获取疫情的最新资讯

42 篇文章 1 订阅
35 篇文章 5 订阅
本文介绍了一种从特定网站抓取疫情数据的方法,并将其存储到MySQL数据库中。通过Python的requests、BeautifulSoup和pymysql库实现网页数据的获取、解析和存储。文章详细展示了如何抓取疫情地图图片、疫情描述、详细信息及实时播报,同时介绍了如何创建数据库、插入数据及检查最新消息。
摘要由CSDN通过智能技术生成
import os
import re
import time

import pymysql
import requests
from bs4 import BeautifulSoup


class Virus(object):
    """Scrape outbreak information from the DXY pneumonia page into MySQL.

    Every scraping method downloads the page again through :meth:`get_html`.
    The CSS class names used for matching are copied from the page markup,
    so they stop matching as soon as the site layout changes.

    The misspelled public names ``covert_timestamp`` and ``check_lastest``
    are kept so existing callers continue to work.
    """

    # Connection settings shared by every database helper. Keyword arguments
    # are used because PyMySQL 1.0+ no longer accepts positional ones.
    # NOTE(review): credentials are hard-coded; move them to configuration
    # before using this outside a local experiment.
    _DB_CONFIG = {
        "host": "localhost",
        "user": "root",
        "password": "Edgar",
        "database": "edgar",
    }

    # File that get_picture() saves the outbreak map to.
    _MAP_FILE = "疫情地图.png"

    def __init__(self):
        super().__init__()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}

    def get_html(self) -> str:
        """Download ``self.url`` and return the decoded page text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
                (Previously the method silently returned ``None`` and the
                callers failed later with an unrelated ``TypeError``.)
        """
        with requests.session() as s:
            response = s.get(self.url, headers=self.header, timeout=5)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text

    def covert_timestamp(self, timestamp) -> str:
        """Convert a millisecond timestamp to ``YYYY-MM-DD HH:MM:SS`` local time.

        Args:
            timestamp: milliseconds since the epoch, as an ``int`` or as a
                string of digits. The millisecond part is dropped.
        """
        seconds = int(timestamp) // 1000
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))

    def get_picture(self):
        """Save the outbreak map image to ``疫情地图.png``.

        Asks on stdin before overwriting an existing file. Prints a notice
        and returns when the image tag is not found in the page.
        """
        pattern = re.compile(r'<img class="mapImg___3LuBG" src="(.*?)">', re.S)
        src = pattern.findall(self.get_html())

        if not src:  # nothing matched
            print("未查找到相关内容,请联系作者更新")
            return

        if os.path.exists(self._MAP_FILE) and not self._confirm_overwrite():
            return
        self._download(src[0], self._MAP_FILE)

    @staticmethod
    def _confirm_overwrite() -> bool:
        """Ask until the user answers y/n; return True when overwriting is allowed."""
        while True:
            choice = input("原疫情地图已存在,是否覆盖: (y/n)  ")
            if choice in ["Y", "y"]:
                return True
            if choice in ["N", "n"]:
                return False
            print("输入错误,请重新输入")

    def _download(self, url, path) -> None:
        """Fetch ``url`` and write its bytes to ``path``."""
        # Download before opening the file so a failed request cannot
        # truncate an image that already exists on disk.
        response = requests.get(url, headers=self.header, timeout=5)
        response.raise_for_status()
        with open(path, "wb") as png:
            png.write(response.content)

    def get_des(self) -> dict:
        """Return ``{"description": [...]}`` with the text of each ``<p>`` in the map header.

        The list is empty when the header block is not present in the page.
        """
        pattern = re.compile(r'<div class="mapTop___2VZCl">(.*?)</div>')
        des_div = pattern.findall(self.get_html())
        if not des_div:
            return {"description": []}
        soup = BeautifulSoup(des_div[0], 'lxml')
        return {"description": [p.get_text() for p in soup.find_all('p')]}

    def get_detail(self) -> dict:
        """Return ``{"detail": [...]}`` with the text of each ``<p>`` in the detail box.

        The list is empty when the detail box is not present in the page.
        """
        soup = BeautifulSoup(self.get_html(), 'lxml')
        div = soup.find("div", {"class": "descBox___3dfIo"})
        if div is None:
            return {"detail": []}
        return {"detail": [p.get_text() for p in div.find_all("p")]}

    def get_broadcast(self) -> list:
        """Return the live-broadcast entries shown on the page.

        Each entry is a dict with the keys ``time``, ``title``, ``content``
        and ``from``. Blocks that lack any of these parts are skipped.
        """
        soup = BeautifulSoup(self.get_html(), 'lxml')
        info_list = []
        for div in soup.find_all("div", {"class": "block___wqUAz"}):
            time_tag = div.find("span", {"class": "leftTime___2zf53"})
            title_tag = div.find("p", {"class": "topicTitle___2ovVO"})
            content_tag = div.find("p", {"class": "topicContent___1KVfy"})
            source_tag = div.find("p", {"class": "topicFrom___3xlna"})
            parts = (time_tag, title_tag, content_tag, source_tag)
            if any(tag is None for tag in parts):
                continue

            # The timestamp may be wrapped in a nested <span>; prefer it.
            inner = time_tag.find("span")
            published = (inner if inner is not None else time_tag).get_text()

            title = title_tag.get_text()
            # Only strip the two-character marker when it really is a
            # prefix; a title that merely contains "最新" must stay intact.
            if title.startswith("最新"):
                title = title[2:]

            info_list.append({
                "time": published,
                "title": title,
                "content": content_tag.get_text(),
                "from": source_tag.get_text(),
            })
        return info_list

    @classmethod
    def _execute(cls, sql, params=None, fetch=False):
        """Run one statement on a fresh connection and always close it.

        Args:
            sql: statement text, using ``%s`` placeholders for values.
            params: values bound to the placeholders by the driver.
            fetch: when true, return ``cursor.fetchall()``; otherwise ``None``.
        """
        connection = pymysql.connect(**cls._DB_CONFIG)
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                rows = cursor.fetchall() if fetch else None
            connection.commit()
            return rows
        finally:
            connection.close()

    def get_all_time(self):
        """Return the ``time`` value of every row stored in the ``detail`` table."""
        rows = self._execute("SELECT time FROM detail", fetch=True)
        return [row[0] for row in rows]

    def check_lastest(self):
        """Store and print the newest broadcast if it is not in the database yet.

        Only the first scraped entry is compared against the stored ``time``
        values. Prints a notice when there is nothing new.
        """
        broadcasts = self.get_broadcast()
        if not broadcasts or broadcasts[0].get("time") in self.get_all_time():
            print("暂无最新消息")
            return

        new = broadcasts[0]
        print("有新的实时播报")
        for value in new.values():
            print(value)
        self.insert(new.get("time"), new.get("title"), new.get("content"),
                    new.get("from"))

        print("\n目前 湖南,上海的情况如下:")
        hunan = self.get_city_detail("湖南")
        shanghai = self.get_city_detail("上海")
        print(hunan, shanghai, sep="\n")

    @classmethod
    def create_database(cls):
        """Create the ``detail`` table if it does not exist."""
        cls._execute(
            "CREATE TABLE IF NOT EXISTS detail(time TEXT, title TEXT, content TEXT, contentFrom TEXT)")

    @classmethod
    def insert(cls, time, title, content, contentFrom):
        """Insert one broadcast row into the ``detail`` table.

        The values are scraped text, so they are passed as bound parameters;
        interpolating them into the statement broke on embedded quotes and
        allowed SQL injection.
        """
        sql = ("INSERT INTO detail(time, title, content, contentFrom) "
               "VALUES (%s, %s, %s, %s)")
        cls._execute(sql, (time, title, content, contentFrom))

    def upload_data(self):
        """Insert every scraped broadcast entry, in reverse page order."""
        for data in reversed(self.get_broadcast()):
            self.insert(data["time"], data["title"], data["content"],
                        data["from"])

    @classmethod
    def get_first_data(cls):
        """Print the result of ``SELECT * FROM detail LIMIT 1``."""
        # cursor.execute() returns a row count, not a result set; the rows
        # have to be fetched from the cursor itself.
        data = cls._execute("SELECT * FROM detail LIMIT 1", fetch=True)
        print(data)

    def get_city_detail(self, city):
        """Return the first detail paragraph containing ``city``, or ``None``."""
        for detail in self.get_detail()["detail"]:
            if city in detail:
                return detail
        return None


if __name__ == "__main__":
    # Manual smoke test. The original first line was tab-indented while the
    # rest of the block uses spaces, which Python 3 rejects with a TabError.
    virus = Virus()
    # Other entry points that can be tried by hand:
    #   virus.get_picture()
    #   virus.upload_data()
    #   print(virus.get_city_detail("湖南"))
    virus.check_lastest()

在获取最新消息的同时,可以把消息通过邮件发送到自己的邮箱(参考这篇文章),然后让程序在服务器中不断地运行,便可以及时获取网页中最新的消息。

!!!目前网站的结构发生变化,此方法不能正确获取信息!!!




网站更新之后,改用下面的代码:

import requests
import os
import time
import re
from bs4 import BeautifulSoup
import pymysql
import json


class Virus(object):
    """Scrape outbreak data from the DXY pneumonia page and store it in MySQL.

    The home page is downloaded once in ``__init__`` and cached in
    ``self._html``; the parsing methods read that snapshot. Call
    :meth:`home_html` again to refresh it in a long-running process.
    """

    # Connection settings shared by every database helper. Keyword arguments
    # are used because PyMySQL 1.0+ no longer accepts positional ones.
    # NOTE(review): credentials are hard-coded; move them to configuration
    # before using this outside a local experiment.
    _DB_CONFIG = {
        "host": "localhost",
        "user": "root",
        "password": "Edgar",
        "database": "edgar",
    }

    # File that get_picture() saves the trend chart to.
    _TREND_FILE = "疫情趋势图.png"

    # Script that embeds the timeline JSON.
    # NOTE(review): the file name looks content-hashed, so this URL
    # presumably changes whenever the site is redeployed - confirm.
    _TIMELINE_JS_URL = "https://assets.dxycdn.com/gitrepo/bbs-mobile/dist/p__Pneumonia__timeline.async.4363ba04.js"

    def __init__(self):
        super().__init__()
        self.url = "https://3g.dxy.cn/newh5/view/pneumonia"
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.79 Safari/537.36"}
        self._html = None
        self.home_html()

    def get_html(self, url) -> str:
        """Download ``url`` and return the decoded response text.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
                (Previously the method silently returned ``None`` and the
                callers failed later with an unrelated ``TypeError``.)
        """
        with requests.session() as s:
            response = s.get(url, headers=self.header, timeout=5)
            response.raise_for_status()
            response.encoding = response.apparent_encoding
            return response.text

    def home_html(self):
        """Download the home page and cache it in ``self._html`` for later parsing."""
        self._html = self.get_html(self.url)

    def get_picture(self):
        """Save the trend chart image to ``疫情趋势图.png``.

        Asks on stdin before overwriting an existing file. Prints a notice
        and returns when the image tag is not found in the cached page.
        """
        pattern = re.compile(r'<img class="mapImg___3LuBG" src="(.*?)">', re.S)
        src = pattern.findall(self._html)

        if not src:  # nothing matched
            print("未查找到相关内容,请联系作者更新")
            return

        if os.path.exists(self._TREND_FILE) and not self._confirm_overwrite():
            return
        self._download(src[0], self._TREND_FILE)

    @staticmethod
    def _confirm_overwrite() -> bool:
        """Ask until the user answers y/n; return True when overwriting is allowed."""
        while True:
            choice = input("原疫情趋势图已存在,是否覆盖: (y/n)  ")
            if choice in ["Y", "y"]:
                return True
            if choice in ["N", "n"]:
                return False
            print("输入错误,请重新输入")

    def _download(self, url, path) -> None:
        """Fetch ``url`` and write its bytes to ``path``."""
        # Download before opening the file so a failed request cannot
        # truncate an image that already exists on disk.
        response = requests.get(url, headers=self.header, timeout=5)
        response.raise_for_status()
        with open(path, "wb") as png:
            png.write(response.content)

    def get_des(self) -> dict:
        """Return ``{"description": [...]}`` with the text of each ``<p>`` in the map header.

        The list is empty when the header block is not present in the page.
        """
        pattern = re.compile(r'<div class="mapTop___2VZCl">(.*?)</div>')
        des_div = pattern.findall(self._html)
        if not des_div:
            return {"description": []}
        soup = BeautifulSoup(des_div[0], 'lxml')
        return {"description": [p.get_text() for p in soup.find_all('p')]}

    def _embedded_json(self, pattern):
        """Parse the JSON captured by ``pattern`` in the cached home page.

        Raises:
            ValueError: if the pattern does not match, or (through
                ``json.loads``) if the captured text is not valid JSON.
        """
        matches = re.findall(pattern, self._html)
        if not matches:
            raise ValueError(
                "pattern %r not found in the cached page" % (pattern,))
        return json.loads(matches[0])

    def get_area_stat(self):
        """Return the parsed ``window.getAreaStat`` JSON embedded in the home page."""
        return self._embedded_json(r'window.getAreaStat = (.*?)}catch')

    def upload_area_stat(self) -> None:
        """Store the scraped province and city statistics in the database."""
        self._store_area_stat(self.get_area_stat())

    def _store_area_stat(self, state) -> None:
        """Insert one province row, plus one row per city, for each entry of ``state``."""
        for info in state:
            # Look the list up by name; popitem() only returned it while
            # "cities" happened to be the last key of the JSON object.
            cities = info.get("cities") or []
            self.insert_to_province(info)
            for city in cities:
                city["provinceShortName"] = info.get("provinceShortName")
                self.insert_to_city(city)

    @staticmethod
    def _unescape_js_string(text) -> str:
        """Resolve backslash escapes in the body of a JavaScript string literal.

        Encoding with ``backslashreplace`` first keeps literal non-ASCII
        characters intact; going through UTF-8 bytes turned them into
        mojibake because ``unicode_escape`` decodes raw bytes as Latin-1.
        """
        return text.encode("latin-1", "backslashreplace").decode(
            "unicode_escape")

    def get_broadcast(self) -> list:
        """Download the timeline script and return the JSON list embedded in it.

        Raises:
            ValueError: if the ``JSON.parse('...')`` call is not found.
        """
        html = self.get_html(self._TIMELINE_JS_URL)
        matches = re.findall(r"JSON.parse\('(.*?)'\)}", html, re.M)
        if not matches:
            raise ValueError("timeline JSON not found in the script")
        return json.loads(self._unescape_js_string(matches[0]))

    def get_left_broadcast(self):
        """Return the parsed ``window.getTimelineService`` JSON embedded in the home page."""
        return self._embedded_json(r"window.getTimelineService =(.*?)}catch")

    @classmethod
    def _execute(cls, sql, params=None, fetch=False):
        """Run one statement on a fresh connection and always close it.

        Args:
            sql: statement text, using ``%s`` placeholders for values.
            params: values bound to the placeholders by the driver.
            fetch: when true, return ``cursor.fetchall()``; otherwise ``None``.
        """
        connection = pymysql.connect(**cls._DB_CONFIG)
        try:
            with connection.cursor() as cursor:
                cursor.execute(sql, params)
                rows = cursor.fetchall() if fetch else None
            connection.commit()
            return rows
        finally:
            connection.close()

    @classmethod
    def get_all_id(cls):
        """Return the ``id`` of every row stored in the ``broadcast`` table."""
        rows = cls._execute("SELECT id FROM broadcast", fetch=True)
        return [row[0] for row in rows]

    def check_latest(self) -> None:
        """Store and print the newest home-page broadcast if it is not stored yet.

        Only the first entry of the cached page is checked, on the
        assumption that the check runs repeatedly. Prints a notice when
        there is nothing new.
        """
        broadcasts = self.get_left_broadcast()
        if not broadcasts or broadcasts[0].get("id") in self.get_all_id():
            print("暂无最新消息")
            return

        new = broadcasts[0]
        print("有新的实时播报")
        print("title: {}".format(new.get("title")))
        print("summary: {}".format(new.get("summary")))
        self.insert(new)

        print("\n目前 湖南,上海的情况如下:")
        hunan = self.get_city_detail("湖南")
        shanghai = self.get_city_detail("上海")
        print(hunan, shanghai, sep="\n")

    @classmethod
    def create_database(cls) -> None:
        """Create the ``broadcast``, ``province`` and ``city`` tables if missing.

        Every statement uses ``IF NOT EXISTS`` so the method can be called
        on each run; the plain ``create table`` statements failed as soon
        as the tables already existed.
        """
        statements = (
            "CREATE TABLE IF NOT EXISTS broadcast(id INTEGER NOT NULL , "
            "pubDate varchar(20) , title TEXT, summary TEXT, infoSource "
            "varchar(40), sourceUrl TEXT, provinceId varchar(20), "
            "provinceName varchar(20), createTime varchar(20), modifyTime "
            "varchar(20)); ",
            "CREATE TABLE IF NOT EXISTS province(provinceName varchar(20),"
            "provinceShortName varchar(20),confirmedCount INT,"
            "suspectedCount INTEGER, curedCount INTEGER,"
            "deadCount INTEGER,comment TEXT);",
            "CREATE TABLE IF NOT EXISTS city(provinceShortName varchar(20),"
            "cityName varchar(20),confirmedCount INTEGER,suspectedCount INT,"
            "curedCount INT,deadCount INTEGER); ",
        )
        for sql in statements:
            cls._execute(sql)

    def insert(self, info) -> None:
        """Insert one broadcast entry (a dict from the timeline JSON).

        Values are bound as parameters because they are scraped text.
        Missing timestamps are stored as NULL.
        """
        def as_date(key):
            value = info.get(key)
            return None if value is None else self.convert_timestamp(value)

        sql = ("INSERT INTO broadcast(id, pubDate, title, summary, "
               "infoSource, sourceUrl, provinceId, provinceName, createTime, "
               "modifyTime) "
               "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)")
        self._execute(sql, (
            info.get("id"), as_date("pubDate"),
            info.get("title"), info.get("summary"),
            info.get("infoSource"), info.get("sourceUrl"),
            info.get("provinceId"), info.get("provinceName"),
            as_date("createTime"), as_date("modifyTime")))

    def upload_data(self) -> None:
        """Insert every entry of the timeline script, in reverse order."""
        for data in reversed(self.get_broadcast()):
            self.insert(data)

    def upload_left_data(self) -> None:
        """Insert home-page broadcasts whose id is not stored yet.

        The home page and the timeline script are not in sync, so the
        home-page entries are inserted separately.
        """
        known_ids = set(self.get_all_id())
        for info in reversed(self.get_left_broadcast()):
            if info.get("id") not in known_ids:
                self.insert(info)

    @staticmethod
    def convert_timestamp(timestamp) -> str:
        """Convert a millisecond timestamp to ``YYYY-MM-DD HH:MM:SS`` local time.

        The millisecond part is dropped.
        """
        seconds = int(timestamp) // 1000
        return time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(seconds))

    def insert_to_province(self, info) -> None:
        """Insert one province row taken from an area-stat entry."""
        sql = ("INSERT INTO province(provinceName, provinceShortName, "
               "confirmedCount, suspectedCount, curedCount, deadCount, "
               "comment) VALUES (%s, %s, %s, %s, %s, %s, %s)")
        self._execute(sql, (
            info.get("provinceName"),
            info.get("provinceShortName"),
            info.get("confirmedCount"),
            info.get("suspectedCount"),
            info.get("curedCount"),
            info.get("deadCount"),
            info.get("comment")))

    @staticmethod
    def _format_province_row(row) -> str:
        """Format ``(shortName, confirmed, suspected, cured, dead)`` as a summary line."""
        short_name, confirmed, suspected, cured, dead = row
        # The sentence lists deaths before recoveries, so the last two
        # values are swapped relative to the column order.
        return "在%s 确诊的有 %d 人,疑似 %d 人,死亡 %d 人,成功治愈 %d 人" % (
            short_name, confirmed, suspected, dead, cured)

    def get_city_detail(self, city) -> str:
        """Return a one-line summary of the stored counts for a province.

        Args:
            city: the province short name, e.g. ``"上海"``.

        Raises:
            LookupError: if the ``province`` table has no row for ``city``.
        """
        # Select the columns by name: positional access on ``SELECT *``
        # reported the cured count as deaths and passed the text comment
        # to ``%d``, which raised TypeError.
        sql = ("SELECT provinceShortName, confirmedCount, suspectedCount, "
               "curedCount, deadCount FROM province "
               "WHERE provinceShortName = %s LIMIT 1")
        rows = self._execute(sql, (city,), fetch=True)
        if not rows:
            raise LookupError("no province row stored for %r" % (city,))
        return self._format_province_row(rows[0])

    @classmethod
    def insert_to_city(cls, info):
        """Insert one city row taken from an area-stat entry."""
        sql = ("INSERT INTO city(provinceShortName, cityName, confirmedCount, "
               "suspectedCount, curedCount, deadCount) "
               "VALUES (%s, %s, %s, %s, %s, %s)")
        cls._execute(sql, (
            info.get("provinceShortName"),
            info.get("cityName"),
            info.get("confirmedCount"),
            info.get("suspectedCount"),
            info.get("curedCount"),
            info.get("deadCount")))

    def refresh_province_city(self):
        """Replace the contents of the ``province`` and ``city`` tables.

        The new data is parsed before the tables are emptied, so a page
        that cannot be parsed no longer wipes the stored statistics.
        """
        state = self.get_area_stat()
        self._execute('truncate table city;')
        self._execute('truncate table province;')
        self._store_area_stat(state)



if __name__ == "__main__":
    # Default run: make sure the tables exist, then reload the province and
    # city statistics from the freshly downloaded page.
    #
    # Other calls that can be enabled by hand on the same object:
    #   get_picture()                   save the trend chart
    #   print(get_des())                page description
    #   print(get_area_stat())          raw province/city JSON
    #   upload_area_stat()              insert province/city rows
    #   print(get_broadcast())          timeline script entries
    #   print(get_left_broadcast())     home-page entries
    #   print(get_all_id())             ids already stored
    #   upload_data() / upload_left_data()
    #   print(get_city_detail("上海"))
    #   check_latest()
    scraper = Virus()
    scraper.create_database()
    scraper.refresh_province_city()



运行程序之后部分结果如下:

provinceName | provinceShortName | confirmedCount | suspectedCount | curedCount | deadCount | comment
湖北省湖北54903124
广东省广东53020
浙江省浙江43010
北京市北京36010
重庆市重庆27000
湖南省湖南24000
上海市上海20010
四川省四川15000
山东省山东15000
安徽省安徽15000
广西壮族自治区广西13000
福建省福建10200福建地区新增疑似 2 例(漳州 1 例;三明 1 例)
河南省河南9000
江苏省江苏9000
海南省海南8000
天津市天津8000
江西省江西7000
陕西省陕西5000
黑龙江省黑龙江4001
辽宁省辽宁4000
贵州省贵州3000
吉林省吉林3000
云南省云南2000
宁夏回族自治区宁夏2000
香港香港2000
澳门澳门2000
河北省河北2001
甘肃省甘肃2000
新疆维吾尔自治区新疆2000
台湾台湾1000
山西省山西1000
内蒙古自治区内蒙古1000
青海省青海0100西宁新增疑似 1 例

provinceShortName | cityName | confirmedCount | suspectedCount | curedCount | deadCount
湖北武汉49503123
湖北孝感22000
湖北黄冈12000
湖北荆州8000
湖北荆门8000
湖北仙桃2000
湖北宜昌1001
湖北十堰1000
广东深圳15020
广东珠海8000
广东佛山7000
广东广州7000
广东惠州5000
广东韶关3000
广东阳江3000
广东湛江2000
广东中山2000
广东肇庆1000
广东清远1000
浙江台州18000
浙江杭州6000
浙江温州6010
浙江宁波5000
浙江嘉兴3000
浙江衢州2000
浙江舟山1000
浙江绍兴1000
浙江金华1000
北京外地来京人员10000
北京海淀6000
北京朝阳5000
北京西城4000
北京昌平3000
北京大兴2010
北京丰台2000
北京通州2000
北京石景山1000
北京顺义1000
重庆万州区3000
重庆巫山县3000
重庆长寿区3000
重庆垫江县2000
重庆永川区2000
重庆九龙坡区2000
重庆渝北区2000
重庆开州区2000
重庆涪陵区1000
重庆大渡口区1000
重庆忠县1000
重庆云阳县1000
重庆奉节县1000
重庆巫溪县1000
重庆秀山县1000
重庆两江新区1000
湖南长沙8000
湖南永州4000
湖南怀化3000
湖南岳阳3000
湖南娄底3000
湖南郴州1000
湖南株洲1000
湖南湘潭1000
四川成都7000
四川广安2000
四川绵阳2000
四川达州1000
四川德阳1000
四川遂宁1000
四川雅安1000
山东青岛4000
山东威海2000
山东临沂2000
山东济南2000
山东烟台2000
山东潍坊1000
山东日照1000
山东济宁1000
安徽合肥6000
安徽六安2000
安徽阜阳2000
安徽滁州1000
安徽亳州1000
安徽安庆1000
安徽池州1000
安徽蚌埠1000
广西北海6000
广西柳州2000
广西桂林2000
广西梧州1000
广西百色1000
广西河池1000
福建福州5000
福建厦门3000
福建泉州1000
福建宁德1000
河南郑州3000
河南巩义2000
河南洛阳1000
河南三门峡1000
河南信阳1000
河南周口1000
江苏南京3000
江苏苏州2000
江苏连云港1000
江苏扬州1000
江苏南通1000
江苏无锡1000
海南海口3000
海南三亚2000
海南万宁2000
海南临高县1000
江西南昌2000
江西抚州1000
江西萍乡1000
江西九江1000
江西新余1000
江西吉安1000
陕西西安2000
陕西咸阳1000
陕西安康1000
陕西延安1000
黑龙江牡丹江1000
黑龙江哈尔滨1000
黑龙江大庆1000
黑龙江绥化1001
辽宁沈阳2000
辽宁大连1000
辽宁朝阳1000
贵州贵阳1000
贵州铜仁1000
贵州黔南州1000
吉林长春1000
吉林吉林1000
吉林松原1000
云南昆明2000
宁夏银川1000
宁夏中卫1000
河北石家庄1000
河北沧州1001
甘肃兰州1000
甘肃白银1000
山西太原1000
内蒙古满洲里1000


最新代码见 GitHub

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值