# python 爬取世界各国以及中国各省份疫情历史数据
# title : data_sync
#description : 获取世界各国以及中国各省份疫情历史数据
#author : qianyulin
#email : qianyulin777@163.com
#date : 2021-10-30 12:00:00
#version : 1.0
#usage : python3 yq_data_pro.py
#python_version: 3.7.2
#======================================================================================================================================================================================
import requests
import pandas as pd
from tqdm import tqdm
import time
class WorldVirusSpider(object):
    """Download historical epidemic figures from the c.m.163.com API into Excel.

    The spider first fetches the area tree (countries, and for China also
    provinces and cities) to resolve area codes, then requests the per-day
    history of each selected area and writes all rows to one workbook.
    """

    # Browser-like User-Agent sent with every request.
    HEADERS = {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.71 Safari/537.36"}

    # Provinces exported in addition to the national total for China.
    TARGET_PROVINCES = ('上海', '浙江', '安徽', '江苏', '福建', '江西')

    # Column layout of the exported sheet (same key order as data_analyz).
    COLUMNS = ('area_name', 'date', 'now_confirm', 'add_confirm', 'total_confirm', 'heal', 'dead')

    # Default workbook location; can be overridden per instance via __init__.
    output_path = '/Users/qianyulin/Desktop/dongzhang/yq_data/yq_data_all.xlsx'

    def __init__(self, output_path=None):
        """Set the API endpoints and, optionally, the output workbook path.

        :param output_path: where save_to_excel writes the workbook; when
            omitted the class-level default path is used.
        """
        self.area_url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-total'
        self.home_url = 'https://c.m.163.com/ug/api/wuhan/app/data/list-by-area-code?areaCode='
        if output_path is not None:
            self.output_path = output_path

    def get_json_from_url(self, url, timeout=10):
        """Request ``url`` and return the decoded JSON body.

        :param url: request URL
        :param timeout: seconds to wait for the server before giving up
        :return: the parsed JSON payload
        :raises requests.RequestException: on network failure, timeout or a
            non-2xx HTTP status
        """
        # Without a timeout requests may block forever on a stalled server.
        response = requests.get(url=url, headers=self.HEADERS, timeout=timeout)
        # Fail loudly on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        return response.json()

    def data_analyz(self, area_name, data):
        """Flatten one daily record of the API into a row dict.

        :param area_name: label stored in the ``area_name`` field of the row
        :param data: one daily entry with ``date``, ``today`` and ``total`` keys
        :return: dict with the keys
            area_name, date,
            now_confirm   -- currently confirmed (total_confirm - heal - dead),
            add_confirm   -- newly confirmed that day,
            total_confirm -- cumulative confirmed,
            heal          -- cumulative recovered,
            dead          -- cumulative deaths
        """
        totals = data['total']
        total_confirm = totals['confirm']
        heal = totals['heal']
        dead = totals['dead']
        return {
            'area_name': area_name,
            'date': data['date'],
            'now_confirm': total_confirm - heal - dead,
            'add_confirm': data['today']['confirm'],
            'total_confirm': total_confirm,
            'heal': heal,
            'dead': dead,
        }

    @staticmethod
    def _area_row(country, province=None, city=None):
        """Build one area-code row; missing levels are filled with 'null'."""
        return {
            'country_name': country['name'],
            'country_id': country['id'],
            'province_name': province['name'] if province is not None else 'null',
            'province_id': province['id'] if province is not None else 'null',
            'city_name': city['name'] if city is not None else 'null',
            'city_id': city['id'] if city is not None else 'null',
        }

    def get_countryAndcity_id(self):
        """Fetch the area tree and return all area codes as a DataFrame.

        China is expanded to one row per city; every other country yields a
        single row whose province and city fields are the string 'null'.

        :return: DataFrame with country/province/city name and id columns
        """
        rows = []
        area_tree = self.get_json_from_url(self.area_url)['data']['areaTree']
        for country in area_tree:
            if country['name'] != '中国':
                rows.append(self._area_row(country))
                continue
            for province in country['children']:
                cities = province.get('children') or []
                if not cities:
                    # Keep provinces that list no cities so their id can still
                    # be looked up by province name.
                    rows.append(self._area_row(country, province))
                    continue
                for city in cities:
                    rows.append(self._area_row(country, province, city))
        return pd.DataFrame(rows)

    def get_area_yq_data_his(self, area_code, area_name):
        """Fetch the daily history of one area code.

        :param area_code: area code appended to the history endpoint URL
        :param area_name: label stored in every returned row
        :return: DataFrame with one row per day reported by the API
        """
        payload = self.get_json_from_url(self.home_url + area_code)
        return pd.DataFrame(
            [self.data_analyz(area_name, day) for day in payload['data']['list']]
        )

    def list_with(self, Series):
        """Deduplicate the values of a Series and join them into one string.

        :param Series: pandas.core.series.Series of strings
        :return: the distinct values concatenated in first-seen order
        """
        # dict.fromkeys dedupes while keeping order, unlike set(), so the
        # result is deterministic when several distinct values are present.
        return ''.join(dict.fromkeys(Series))

    def save_to_excel(self, df, sheetName):
        """Write ``df`` to the sheet ``sheetName`` of the output workbook.

        :param df: DataFrame to export
        :param sheetName: name of the worksheet to create
        """
        # The context manager saves and closes the workbook on exit; an
        # explicit writer.save() is redundant and no longer exists in
        # current pandas releases.
        with pd.ExcelWriter(self.output_path) as writer:
            df.to_excel(excel_writer=writer, sheet_name=sheetName)

    def get_res_code(self):
        """Resolve the area codes of the regions to export.

        Adjust TARGET_PROVINCES to change which provinces are exported.

        :return: list of ``{'area_code': ..., 'area_name': ...}`` dicts, China
            first, followed by the target provinces
        """
        area_df = self.get_countryAndcity_id()
        china_ids = area_df[area_df.country_name == '中国']['country_id']
        targets = [{'area_code': self.list_with(china_ids), 'area_name': '中国'}]
        for province in self.TARGET_PROVINCES:
            province_ids = area_df[area_df.province_name == province]['province_id']
            targets.append({'area_code': self.list_with(province_ids), 'area_name': province})
        return targets

    def run(self):
        """Download the history of every target area and export it to Excel.

        Failures are reported on stdout rather than raised, so a failed run
        does not terminate the calling script with a traceback.
        """
        try:
            frames = []
            for area in tqdm(self.get_res_code(), desc='Processing'):
                history = self.get_area_yq_data_his(area['area_code'], area['area_name'])
                if not history.empty:
                    frames.append(history)
                # Pause between requests to stay gentle on the API.
                time.sleep(0.5)
            if frames:
                # axis=0 stacks the per-area frames row-wise.
                result = pd.concat(frames, axis=0)
            else:
                result = pd.DataFrame(columns=list(self.COLUMNS))
            self.save_to_excel(result, 'confirm')
            print('=========Writer data Successful!=======')
        except Exception as exc:
            # Exception (not a bare except) lets Ctrl-C / SystemExit through,
            # and the cause is printed so failures can be diagnosed.
            print('==================Error!===============')
            print(f'{type(exc).__name__}: {exc}')
if __name__ == '__main__':
    # Script entry point: build the spider and run one full export.
    WorldVirusSpider().run()