Real-Time COVID-19 Data Scraping (& Visualization)

Scraping real-time COVID-19 data for China

import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
url = 'https://ncov.dxy.cn/ncovh5/view/pneumonia?from=timeline&isappinstalled=0'
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
    }
# Province-level regular expressions
provinceName_re = re.compile(r'"provinceName":"(.*?)",')
provinceShortName_re = re.compile(r'"provinceShortName":"(.*?)",')
currentConfirmedCount_re = re.compile(r'"currentConfirmedCount":(.*?),')
confirmedCount_re = re.compile(r'"confirmedCount":(.*?),')
suspectedCount_re = re.compile(r'"suspectedCount":(.*?),')
curedCount_re = re.compile(r'"curedCount":(.*?),')
deadCount_re = re.compile(r'"deadCount":(.*?),')
comment_re = re.compile(r'"comment":"(.*?)",')
locationId_re = re.compile(r'"locationId":(.*?),')
statisticsData_re = re.compile(r'"statisticsData":"(.*?)",')
cities_re = re.compile(r'"cities":\[\{(.*?)\}\]')

# City-level regular expressions (same count patterns, scoped to the nested cities arrays)
cityName_re = re.compile(r'"cityName":"(.*?)",')
currentConfirmedCount_1_re = re.compile(r'"currentConfirmedCount":(.*?),')
confirmedCount_1_re = re.compile(r'"confirmedCount":(.*?),')
suspectedCount_1_re = re.compile(r'"suspectedCount":(.*?),')
curedCount_1_re = re.compile(r'"curedCount":(.*?),')
deadCount_1_re = re.compile(r'"deadCount":(.*?),')
locationId_1_re = re.compile(r'"locationId":(.*?)\},')
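# Quick sanity check of the non-greedy patterns against a made-up fragment
# (illustrative values only, not real data):
# sample = '"provinceName":"湖北省","provinceShortName":"湖北","currentConfirmedCount":3,'
# provinceName_re.findall(sample)          # -> ['湖北省']
# currentConfirmedCount_re.findall(sample) # -> ['3']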

# Fetch the page
resp = requests.get(url, headers=headers)
resp.encoding = 'utf-8'
soup = BeautifulSoup(resp.text, 'lxml')
data = soup.find_all('script', {'id': 'getAreaStat'})  # tag located via browser inspection
data = str(data)
data_str = data[54:-23]  # position-based slice that strips the script wrapper around the JSON array
#print(data_str)
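# A sturdier alternative to the fixed offsets (a sketch; assumes the tag's text
# is a `window.getAreaStat = [...]` assignment): slice between the outermost brackets.
# raw = soup.find('script', {'id': 'getAreaStat'}).string
# data_str = raw[raw.find('['):raw.rfind(']') + 1]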

# Blank out the nested "cities" arrays so the province-level patterns don't also match city records
provinces_str = re.sub(cities_re, '8888', data_str)
# Extract the province-level fields
provinceNames = re.findall(provinceName_re, provinces_str)
provinceShortNames = re.findall(provinceShortName_re, provinces_str)
currentConfirmedCounts = re.findall(currentConfirmedCount_re, provinces_str)
confirmedCounts = re.findall(confirmedCount_re, provinces_str)
suspectedCounts = re.findall(suspectedCount_re, provinces_str)
curedCounts = re.findall(curedCount_re, provinces_str)
deadCounts = re.findall(deadCount_re, provinces_str)
comments = re.findall(comment_re, provinces_str)
locationIds = re.findall(locationId_re, provinces_str)
statisticsDatas = re.findall(statisticsData_re, provinces_str)


# Extract the city-level data
citiess_str1 = re.findall(cities_re, data_str)
# Join the list of city blocks into one string for regex matching
citiess_str = str(citiess_str1)
cityName = re.findall(cityName_re,citiess_str)
currentConfirmedCount_1 = re.findall(currentConfirmedCount_1_re,citiess_str)
confirmedCount_1 = re.findall(confirmedCount_1_re,citiess_str)
suspectedCount_1 = re.findall(suspectedCount_1_re,citiess_str)
curedCount_1 = re.findall(curedCount_1_re,citiess_str)
deadCount_1 = re.findall(deadCount_1_re,citiess_str)

# Assemble the province-level data into a pandas DataFrame
df = {
    '地区代码':pd.Series(locationIds),
    '省':pd.Series(provinceNames),
    '省区短名':pd.Series(provinceShortNames),
    '当前确诊':pd.Series(currentConfirmedCounts),
    '累计确诊':pd.Series(confirmedCounts),
    '疑似确诊':pd.Series(suspectedCounts),
    '治愈人数':pd.Series(curedCounts),
    '死亡人数':pd.Series(deadCounts),
    '评论':pd.Series(comments),
    '统计数据区':pd.Series(statisticsDatas),
}
pds = pd.DataFrame(df)

# Assemble the city-level data into a pandas DataFrame
df2 = {
    '城市名':pd.Series(cityName),
    '当前确诊':pd.Series(currentConfirmedCount_1),
    '累计确诊':pd.Series(confirmedCount_1),
    '疑似确诊':pd.Series(suspectedCount_1),
    '治愈人数':pd.Series(curedCount_1),
    '死亡人数':pd.Series(deadCount_1),
}
pdc = pd.DataFrame(df2)
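# Note: re.findall returns strings, so the count columns in pds/pdc hold text,
# not numbers. When analysing in memory (without the CSV round-trip below), cast first, e.g.:
# pdc[['当前确诊','累计确诊','治愈人数']] = pdc[['当前确诊','累计确诊','治愈人数']].astype(int)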
# print(pdc)

# Save the data
pds.to_csv(r'templates\china_cor_data.csv', index=True)  # province-level data
pdc.to_csv(r'templates\cities_cor_data.csv', index=True)  # city-level data

# Data analysis (the scraping step already grouped the data, so we can run summary statistics directly)
df1 = pd.read_csv(r'templates\china_cor_data.csv')
df2 = pd.read_csv(r'templates\cities_cor_data.csv')

#1 Province-level analysis (read_csv re-parses the count columns as numbers, so describe() works directly)
data1 = df1[['省区短名','当前确诊','累计确诊','治愈人数']]
analysis1 = data1[['当前确诊','累计确诊','治愈人数']].describe().loc[['mean','std','min','max'],:]
analysis1.to_csv(r'templates\analysis1.csv',index=True) # province-level summary

#2 City-level analysis
data2 = df2[['城市名','当前确诊','累计确诊','治愈人数']]
analysis2 = data2[['当前确诊','累计确诊','治愈人数']].describe().loc[['mean','std','min','max'],:]
analysis2.to_csv(r'templates\analysis2.csv',index=True) # city-level summary (includes entries outside mainland China)


# Data visualization
# 1. Per-province situation
map_data = pds.loc[:, ['省区短名','当前确诊']]
df_list = [[name, int(count)] for name, count in map_data.values.tolist()]  # cast the regex strings to ints for the heatmap
# print(df_list)

from pyecharts import options as opts
from pyecharts.charts import Geo
from pyecharts.globals import ChartType

def china_map_1():
    c = (
        Geo()
        .add_schema(maptype="china")
        .add(
            "geo",
            df_list,
            type_=ChartType.HEATMAP,
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            visualmap_opts=opts.VisualMapOpts(),
            title_opts=opts.TitleOpts(title="国内省份疫情实况"),
            toolbox_opts=opts.ToolboxOpts(
                is_show=True,  # show the toolbox
            ),
        )
        .render(r"templates\map_visualmap_china.html")  # province heatmap
    )
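china_map_1()  # writes templates\map_visualmap_china.html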

from pyecharts.charts import Bar
def analysis1_bar():
    c = (
        Bar()
        .add_xaxis(['当前确诊','累计确诊','治愈人数'])
        .add_yaxis("平均值",analysis1.loc['mean', :].values.tolist(), stack="stack1")
        .add_yaxis("最大值",analysis1.loc['max',:].values.tolist(), stack="stack2")
        .set_series_opts(
                label_opts=opts.LabelOpts(is_show=False),
                markline_opts=opts.MarkLineOpts(
                    data=[
                        opts.MarkLineItem(type_="max", name="最大值"),
                        opts.MarkLineItem(type_="average", name="平均值"),
                    ]
                ),
            )
        .set_global_opts(title_opts=opts.TitleOpts(title="省级.简单统计结果"))
        .render(r"templates\analysis1_bar_stack.html") # bar chart of the summary statistics
    )
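analysis1_bar()  # writes templates\analysis1_bar_stack.html

The regex-and-slice extraction above is brittle; since the getAreaStat payload is itself a JSON array, parsing it directly is sturdier. A minimal sketch (assuming the script tag still wraps the array in a window.getAreaStat assignment):

import json

raw = soup.find('script', {'id': 'getAreaStat'}).string
payload = raw[raw.find('['):raw.rfind(']') + 1]  # keep only the outermost JSON array
provinces = json.loads(payload)
# each province dict carries the same fields the regexes target, plus a nested 'cities' list
rows = [(p['provinceShortName'], p['currentConfirmedCount']) for p in provinces]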

Scraping real-time COVID-19 data for the US and the world

import requests
import urllib3
import re
from bs4 import BeautifulSoup
import csv
import pandas as pd
import numpy as np

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
    "Referer": "http://m.sinovision.net",
    'Host': 'm.sinovision.net',
}

# res = requests.get('https://voice.baidu.com/act/newpneumonia/newpneumonia#tab4',headers=headers)
# print(res.status_code)

url = 'http://m.sinovision.net/newpneumonia.php' # request URL

datas = []  # module-level list; appends inside the function mutate it in place, so no global declaration is needed
def statistic(url):
    response = requests.get(url, headers=headers)
    data = response.content.decode('utf-8')
    #print(data)
    soup = BeautifulSoup(data, 'html5lib')
    for tag in soup.find_all('div', class_='main-block'):
        country = tag.find('span', attrs={"class": "area"}).get_text()  # country/region (or US state) name
        cer = tag.find('span', class_='confirm').get_text()  # confirmed count
        death = tag.find('span', class_='dead').get_text()  # death count
        cure = tag.find('span', class_='cured-notag')  # cured count (absent for some rows)
        if cure is None:
            cure = 0
        else:
            cure = cure.get_text()
        data = {"国家": country, "确诊": cer, "死亡": death, "治愈": cure}
        datas.append(data)
        # print(datas)

statistic(url)

# The two tables on the page use nearly identical class names, so the scrape yields one flat
# list of 279 records. The 'US' row marks the boundary: datas[57:] is the global dataset,
# and everything before it is the per-state US dataset.
for index, data in enumerate(datas):
    if data['国家'] == 'US':
        print(index)  # prints 57
        break

US_datasets = datas[:57]
global_datasets = datas[57:]
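A hard-coded boundary breaks if the upstream tables change length; a sketch of a programmatic split (assuming the 'US' row still marks the start of the global table):

split_at = next(i for i, d in enumerate(datas) if d['国家'] == 'US')
US_datasets = datas[:split_at]
global_datasets = datas[split_at:]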

#print(US_datasets)
#print(global_datasets)

# Save the data
name = ["国家", "确诊", "死亡", "治愈"]
usdata = pd.DataFrame(columns=["国家", "确诊", "死亡"], data=US_datasets)  # only three fields are kept for the US table
# print(usdata)
usdata.to_csv(r'templates\usdata.csv')

globaldata = pd.DataFrame(columns=name, data=global_datasets)
globaldata.to_csv(r'templates\globaldata.csv')

# Data analysis
df = pd.read_csv(r'templates\usdata.csv')

result2 = [[value['国家'], value['确诊']] for index, value in df.iterrows()]  # [state, confirmed] pairs
result3 = [[value['国家'], value['死亡']] for index, value in df.iterrows()]  # [state, deaths] pairs
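The same pairs can be built without iterrows (an equivalent, faster sketch):

# result2 = df[['国家', '确诊']].values.tolist()
# result3 = df[['国家', '死亡']].values.tolist()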

x_data = np.mean([item[1] for item in result2])     # mean confirmed
x_data2 = np.median([item[1] for item in result2])  # median confirmed
y_data = np.mean([item[1] for item in result3])     # mean deaths
y_data2 = np.median([item[1] for item in result3])  # median deaths
# print("确诊人数:", x_data, "死亡人数:", y_data)
str1 = "平均确诊" + str(round(x_data, 2)) + '\n\n' + "中位数" + str(round(x_data2, 2))  # subtitle for the confirmed-cases map
str2 = "平均死亡" + str(round(y_data, 2)) + '\n\n' + "中位数" + str(round(y_data2, 2))  # subtitle for the deaths map
#print(str1)
#print(str2)

# Visualization
from pyecharts import options as opts
from pyecharts.charts import Map
from pyecharts.globals import ThemeType

def qz():
    c = (
        Map(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
        # available map names are listed in pyecharts.datasets.map_filenames.json
        .add("确诊人数", result2, "美国")
        .set_global_opts(
            title_opts=opts.TitleOpts(title="美国确诊人数", subtitle=str1),
            visualmap_opts=opts.VisualMapOpts(max_=4000000)
        )
        .render(r"templates\确诊人数.html")
    )

def sw():
    d = (
        Map(init_opts=opts.InitOpts(theme=ThemeType.LIGHT))
        # available map names are listed in pyecharts.datasets.map_filenames.json
        .add("死亡人数", result3, "美国")
        .set_global_opts(
            title_opts=opts.TitleOpts(title="死亡人数", subtitle=str2),
            visualmap_opts=opts.VisualMapOpts(max_=65000)
        )
        .render(r"templates\death.html")
    )
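qz()  # writes templates\确诊人数.html
sw()  # writes templates\death.html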
