数据来源是腾讯新闻,地址是https://news.qq.com/zt2020/page/feiyan.htm#/
本文获取的是历史数据和当日详细数据,返回的数据类型是json数据,其中的provinceCompare字段是各省的总体情况;如果想做国外疫情的爬虫也没问题,可以参看我在代码注释中提供的url。
除此之外,百度也提供了数据,以前写过,但后来文件找不到了。当初没找到json数据,用的是字符串切片,有兴趣的朋友可以试一下,地址是https://voice.baidu.com/act/newpneumonia/newpneumonia/?from=osari_aladin_banner
注意,中国当天的数据在chinaDayList字段的最后。
import pymysql
import time
import json
import traceback #追踪异常
import requests
import utils
def _to_iso_date(md, year=2020):
    """Convert a month-day fragment from the API ('06.22' or '06/22') to 'YYYY-MM-DD'.

    The history lists omit the year (historically hard-coded to 2020); callers
    that know the year (e.g. from the API's ``syear`` field) can pass it.
    The ISO form is needed because the database column is DATETIME.
    """
    normalized = '%s.%s' % (year, str(md).replace('/', '.'))
    tup = time.strptime(normalized, '%Y.%m.%d')
    return time.strftime('%Y-%m-%d', tup)


def get_tencent_data():
    """Fetch COVID-19 statistics from the Tencent News API.

    :return: tuple ``(history, history_add, city, pro)``:
        history     -- nationwide cumulative figures per day (chinaDayList)
        history_add -- nationwide daily increases (chinaDayAddList)
        city        -- per-city detail rows for the latest day
        pro         -- per-province summary rows (provinceCompare)
    """
    # Alternative domestic-modules endpoint:
    # url = 'https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare'
    # Endpoint for overseas epidemic data:
    # url = 'https://api.inews.qq.com/newsqa/v1/automation/modules/list?modules=FAutoGlobalStatis,FAutoGlobalDailyList,FAutoCountryConfirmAdd'
    url = 'https://view.inews.qq.com/g2/getOnsInfo?name=disease_other'
    headers = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36',
    }
    # BUG FIX: the original called requests.get(url, headers), which binds the
    # dict to the positional `params` argument, so the custom User-Agent was
    # never sent; headers must be passed by keyword.
    r = requests.get(url, headers=headers)
    res = json.loads(r.text)            # outer envelope: JSON string -> dict
    data_all = json.loads(res['data'])  # the 'data' field is itself a JSON-encoded string

    history = []  # nationwide cumulative history
    for day in data_all['chinaDayList']:
        day['date'] = _to_iso_date(day['date'])
        del day['y']  # chart-only field, not stored in the database
        history.append(day)

    history_add = []  # nationwide daily-increase history
    for day in data_all['chinaDayAddList']:
        day['date'] = _to_iso_date(day['date'])
        del day['y']
        history_add.append(day)

    city = []  # per-city detail rows for the latest day
    ds = time.strftime('%Y-%m-%d')  # defined fallback date in case the city list is empty
    # statisGradeCityDetail holds today's per-city figures
    # (provinceCompare, used below, holds the per-province summary)
    for city_infos in data_all['statisGradeCityDetail']:
        ds = _to_iso_date(city_infos['date'], year=city_infos['syear'])
        city.append({
            'name': city_infos['city'],
            'province': city_infos['province'],
            'confirm': city_infos['confirm'],         # cumulative confirmed
            'confirm_add': city_infos['confirmAdd'],  # newly confirmed
            'heal': city_infos['heal'],               # cured
            'dead': city_infos['dead'],               # deaths
            'time': ds,
        })

    pro = []  # per-province summary rows
    # NOTE(review): the original reused the last city's `ds` as the province
    # date; that behavior is kept, now with a defined fallback above instead
    # of a potential NameError when the city list is empty.
    for pro_name, pro_info in data_all['provinceCompare'].items():
        pro.append({
            'name': pro_name,
            'now_confirm': pro_info['nowConfirm'],
            'confirm_add': pro_info['confirmAdd'],
            'dead': pro_info['dead'],
            'heal': pro_info['heal'],
            'zero': pro_info['zero'],  # flag from the API; semantics not shown here — presumably "zero new cases"
            'date': ds,
        })
    return history, history_add, city, pro
print('执行spyder.py文件')

if __name__ == '__main__':
    # Run the crawler and print each of the four result lists.  A plain loop
    # replaces the original list comprehension, which was used only for its
    # print side effect (an anti-idiom: comprehensions are for building lists).
    for part in get_tencent_data():
        print(part)
以前还写过一个简单版本的,也放上来吧。它能获取到的信息也很多,只不过我只提取了各省份的信息。
import requests
import json
# Other sources tried before settling on the Tencent endpoint:
# url='https://voice.baidu.com/act/newpneumonia/newpneumonia/?from=singlemessage&qq-pf-to=pcqq.c2c'
# url='https://voice.baidu.com/act/newpneumonia/newpneumonia/?from=singlemessage'
# also http://ncov.deepeye.tech/
# https://news.qq.com/zt2020/page/feiyan.htm#/?pool=bj&nojump=2
url='https://view.inews.qq.com/g2/getOnsInfo?name=disease_h5&callback=jQuery341027523371041305267_1592721097964&_=1592721097965'

# The endpoint replies with JSONP: jQuery...( {json} ).  Strip everything up
# to and including the first '(' plus the trailing ')' to recover the JSON.
response = requests.get(url).text
response = response[response.find('(') + 1:-1]
data = json.loads(response)['data']
# 'data' is itself a JSON string; areaTree[0] is China, 'children' its provinces —
# a list holding the per-province records we want.
data = json.loads(data)['areaTree'][0]['children']
for i in data:
    print(i)
    name = i['name']                       # affected region
    today_add = i['today']['confirm']      # newly confirmed today
    nowConfirm = i['total']['nowConfirm']  # currently confirmed
    confirm = i['total']['confirm']        # cumulative confirmed
    heal = i['total']['heal']              # cured
    dead = i['total']['dead']              # deaths
    # print((name,today_add,nowConfirm,confirm,heal,dead))
    # print(i,data[i])