python 对“新型冠状病毒感染数据”的爬取和简要分析
0x01 数据来源
数据来源于腾讯对新型冠状病毒数据的收集整理:
本次,python对该数据的收集主要是调用了腾讯的数据接口:https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5,腾讯的数据接口返回的主要是json格式的数据,易于处理:
0x02 数据格式
1、数据返回的格式为:json
2、数据存储的格式:
根据数据爬取的日期,存储为:
文件名 | 数据内容 |
---|---|
2020-02-11-ChinaDayStatistics.data | 20200211获取国内日统计数据 |
2020-02-11-ChinaDayAddStatistics.data | 20200211获取国内日增加数据 |
2020-02-11-WorldStatistics.data | 20200211爬取的全世界日统计数据 |
ChinaAdd.data | 中国日增加数据统计 |
ChinaTotal.data | 全国范围内感染数据总统计 |
0x03 数据展示
1、2020-02-11-ChinaDayStatistics.data
中国国内日统计数据
2、2020-02-11-ChinaDayAddStatistics.data
中国国内日新增数据
3、2020-02-11-WorldStatistics.data
全世界范围内,日统计数据
4、ChinaAdd.data
中国国内新增统计
5、ChinaTotal.data
中国国内总统计
0x04 可视化展示
1、各省份确诊饼状图
目前,只是通过pyecharts统计了世界范围内日确诊数量在各个省份间的饼状分布图,以2020-02-11日的统计数据为例展示如下:
2、全国日统计数据轮播图
生成的html文档中,当点击pause按钮时,自动轮播当前统计数据,其中x轴为统计的日期,y轴为数据
0x05 代码
1、数据爬取部分代码
2019-nCoV.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/5/005 14:14
# @Author : H
# @File : 2019-nCoV.py
import requests
import json
import os
# Endpoint queried by this script; the response is JSON whose 'data' field is
# itself a JSON-encoded string (it is decoded a second time in main()).
baseurl = "https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5"

# Directory (relative to the working directory) that receives every output file.
DATA_DIR = "../data"

# Counter keys written for every total/today/summary record, in column order.
STAT_KEYS = ('confirm', 'suspect', 'dead', 'heal')

# Columns written for every entry of chinaDayList / chinaDayAddList.
DAY_KEYS = ('date', 'confirm', 'suspect', 'dead', 'heal', 'deadRate', 'healRate')


def join_fields(values):
    """Return the values converted with str() and joined by '#' (no newline)."""
    return "#".join(str(value) for value in values)


def summary_row(update_time, stats):
    """Return 'time#confirm#suspect#dead#heal' for one summary record."""
    return join_fields([update_time] + [stats[key] for key in STAT_KEYS])


def area_row(province, node):
    """Return the ten-column row for one area node.

    Columns are: province, node['name'], the four node['total'] counters,
    then the four node['today'] counters.
    """
    total = node['total']
    today = node['today']
    columns = [province, node['name']]
    columns.extend(total[key] for key in STAT_KEYS)
    columns.extend(today[key] for key in STAT_KEYS)
    return join_fields(columns)


def day_row(entry):
    """Return the seven-column row for one daily-statistics entry."""
    return join_fields(entry[key] for key in DAY_KEYS)


def append_summary(path, update_time, stats):
    """Append one summary row to *path* (the file accumulates across runs)."""
    with open(path, "a", encoding='utf-8') as f:
        f.write(summary_row(update_time, stats) + '\n')


def write_area_file(path, area_tree):
    """Write the per-area statistics file, replacing any previous content.

    Entries named '中国' are expanded to one row per child of each of their
    children; every other entry is written as a single row with its own name
    in both the province and the city column. Each written node is also
    echoed to stdout.
    """
    with open(path, "w", encoding='utf-8') as f:
        f.write("province#city#totalConfirm#totalSuspect#totalDead#totalHeal#todayConfirm#todaySuspect#todayDead#todayHeal\n")
        for country in area_tree:
            if country['name'] == '中国':
                for province in country['children']:
                    for city in province['children']:
                        f.write(area_row(province['name'], city) + '\n')
                        print(province['name'], city)
            else:
                print(country)
                f.write(area_row(country['name'], country) + '\n')


def write_day_file(path, entries):
    """Write a header plus one row per daily entry, replacing previous content."""
    with open(path, "w", encoding='utf-8') as f:
        f.write("date#confirm#suspect#dead#heal#deadRate#healRate\n")
        for entry in entries:
            f.write(day_row(entry) + '\n')
            print(entry)


def main():
    """Fetch the statistics once and write the five data files."""
    # Without a timeout requests may block indefinitely on a stalled connection.
    res = requests.get(baseurl, timeout=30)
    if not res.ok:
        # Report the failure instead of exiting silently with no files written.
        print("request to", baseurl, "failed with HTTP status", res.status_code)
        return

    data = json.loads(res.json()['data'])
    updateTime = data['lastUpdateTime']
    # Only the part before the first space is used in the file names.
    day = updateTime.split(" ")[0]

    os.makedirs(DATA_DIR, exist_ok=True)

    for key, value in data.items():
        print(key, value)

    append_summary(DATA_DIR + "/ChinaTotal.data", updateTime, data['chinaTotal'])
    append_summary(DATA_DIR + "/ChinaAdd.data", updateTime, data['chinaAdd'])

    # The dated files are rewritten from scratch on every run (mode "w"),
    # which replaces the former remove-then-append sequence.
    write_area_file(DATA_DIR + "/" + day + "-WorldStatistics.data", data['areaTree'])
    write_day_file(DATA_DIR + "/" + day + "-ChinaDayStatistics.data", data['chinaDayList'])
    write_day_file(DATA_DIR + "/" + day + "-ChinaDayAddStatistics.data", data['chinaDayAddList'])


if __name__ == "__main__":
    main()
2、数据分析部分~~各省内确诊饼状图
fenxi2.py
得到的html文档,建议使用Google浏览器打开
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/6/006 0:15
# @Author : H
# @File : fenxi2.py
import pyecharts
from pyecharts import options as opts
from pyecharts.charts import Page, Pie
from pyecharts.commons.utils import JsCode
from pyecharts.faker import Collector, Faker
filename = "../data/2020-02-11-WorldStatistics.data"
C = Collector()


def load_confirm_by_province(path):
    """Group city names and total-confirmed values by province.

    Each line of the file at *path* is split on '#'; the first three fields
    are used as province, city and total confirmed count. The header row
    (first field 'province') and lines with fewer than three fields are
    skipped.

    Returns a dict mapping province -> [list of cities, list of counts], in
    order of first appearance. Counts are kept as the strings read from the
    file.
    """
    grouped = {}
    # A single pass replaces re-reading the whole file once per province, and
    # the context manager closes the handle that was previously leaked.
    with open(path, "r", encoding='utf-8') as fh:
        for line in fh:
            fields = line.strip('\n').split("#")
            if len(fields) < 3 or fields[0] == 'province':
                continue
            # setdefault also registers a province when it is the only one in
            # the file; previously such a province was never stored.
            cities, confirms = grouped.setdefault(fields[0], [[], []])
            cities.append(fields[1])
            confirms.append(fields[2])
    return grouped


# province -> [city names, total confirmed counts]
data = load_confirm_by_province(filename)
def huatu(k,m,n):
    """Register a pie-chart builder on the collector ``C``.

    k: name used as the prefix of the chart title.
    m: labels of the slices.
    n: values of the slices, paired with ``m`` position by position.
    """
    @C.funcs
    def pie_base() -> Pie:
        # Pair every label with its value; zip stops at the shorter sequence.
        slices = [[label, value] for label, value in zip(m, n)]
        chart = Pie()
        chart.add("", slices, center=["60%", "60%"])
        chart.set_global_opts(
            title_opts=opts.TitleOpts(title=k + "确诊饼状图"),
            legend_opts=opts.LegendOpts(pos_left="20%"),
        )
        chart.set_series_opts(label_opts=opts.LabelOpts(formatter="{b}: {c}"))
        return chart
# Register one pie chart per entry of `data`, then build every registered
# chart and render them together on one page.
for province_name, city_stats in data.items():
    city_names, confirm_counts = city_stats[0], city_stats[1]
    huatu(province_name, city_names, confirm_counts)

page = Page()
page.add(*(build_chart() for build_chart, _name in C.charts))
page.render()
3、数据分析部分~~国内日统计数据轮播图
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2020/2/6/006 1:55
# @Author : H
# @File : ChinaDayStatistics.py
from pyecharts.commons.utils import JsCode
from pyecharts import options as opts
from pyecharts.charts import Bar, BMap, Grid, Map, Page, Pie, Sankey, Timeline
from pyecharts.faker import Collector, Faker
C = Collector()
filename = "../data/2020-02-11-ChinaDayStatistics.data"
# Category labels, in the same order as the six value columns stored per date.
obj = ['confirm', 'suspect', 'dead', 'heal','deadRate','healRate']
# date -> [confirm, suspect, dead, heal, deadRate, healRate], kept as the
# strings read from the file.
data = {}
# The context manager closes the handle that was previously leaked.
with open(filename, "r", encoding='utf-8') as fh:
    for line in fh:
        # Split once per line instead of once per column.
        fields = line.strip('\n').split("#")
        # Skip the header row and blank/truncated lines, which used to raise
        # IndexError.
        if len(fields) < 7 or fields[0] == 'date':
            continue
        data[fields[0]] = fields[1:7]
@C.funcs
def timeline_bar_reversal() -> Timeline:
    """Build a timeline holding one axis-reversed bar chart per entry of ``data``."""
    timeline = Timeline()
    for day, values in data.items():
        chart = Bar()
        chart.add_xaxis(obj)
        chart.add_yaxis(
            "国内日感染数据",
            values,
            label_opts=opts.LabelOpts(position="right"),
        )
        chart.reversal_axis()
        heading = opts.TitleOpts("Timeline-Bar-Reversal (时间: {} 号)".format(day))
        chart.set_global_opts(title_opts=heading)
        timeline.add(chart, "{}号".format(day))
    return timeline
@C.funcs
def timeline_bar_with_graphic() -> Timeline:
    """Build a timeline of bar charts, each carrying a rotated caption banner.

    One chart is added per entry of ``data``; the entry's key appears in the
    chart title, in the banner text and in the timeline label.
    """
    timeline = Timeline()
    for day, values in data.items():
        caption = "国内{}号统计".format(day)

        # Semi-transparent rectangle drawn behind the banner text.
        banner_rect = opts.GraphicRect(
            graphic_item=opts.GraphicItem(left="center", top="center", z=100),
            graphic_shape_opts=opts.GraphicShapeOpts(width=400, height=50),
            graphic_basicstyle_opts=opts.GraphicBasicStyleOpts(
                fill="rgba(0,0,0,0.3)"
            ),
        )
        banner_text = opts.GraphicText(
            graphic_item=opts.GraphicItem(left="center", top="center", z=100),
            graphic_textstyle_opts=opts.GraphicTextStyleOpts(
                text=caption,
                font="bold 18px Microsoft YaHei",
                graphic_basicstyle_opts=opts.GraphicBasicStyleOpts(fill="#fff"),
            ),
        )
        # Group placing both elements relative to the right/bottom edges,
        # rotated by the JavaScript expression Math.PI / 4.
        banner = opts.GraphicGroup(
            graphic_item=opts.GraphicItem(
                rotation=JsCode("Math.PI / 4"),
                bounding="raw",
                right=50,
                bottom=60,
                z=50,
            ),
            children=[banner_rect, banner_text],
        )

        chart = Bar()
        chart.add_xaxis(obj)
        chart.add_yaxis("国内日感染数据", values)
        chart.set_global_opts(
            title_opts=opts.TitleOpts(caption),
            graphic_opts=[banner],
        )
        timeline.add(chart, "{}号".format(day))
    return timeline
# Build every chart registered on the collector and render them on one page.
page = Page()
page.add(*(build_chart() for build_chart, _name in C.charts))
page.render()