题外话
已经好久没写东西了。目前跳槽啦,上岸啦。公司要做疫情大屏,需要抓取疫情数据。
本来想用爬虫去抓的,奈何我不懂写啊,时间还紧。就直接调用接口这种最简单的方式去实现。
进入主题:
我调用的是腾讯的API。这个还挺好的。但是吧,有点坑。
比如北京风险地区总数是15,但是各区域(比如朝阳、丰台)都是0,导致数据对不上。
其他还没发现啥毛病。
公司使用正反向代理服务器,所以用到了socks5去读取数据。大家如果网络不隔离,可以忽略socks5这段代码。
调用接口:
api_url="https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=statisGradeCityDetail,diseaseh5Shelf"
直接调用api。解析后落盘本地。我落地的位置是如下,大家可以更改自己的位置。
C:\\Users\\Administrator\\Desktop\\data.xlsx
采集后数据长这个样子
Python代码:
# -*- coding: utf-8 -*-
import datetime
import json
import os
import time
from time import strftime

import pandas as pd
import requests
import socks
from sqlalchemy import create_engine
'''
# -------------------------------------------------
# FileName: national_covid19_epidemic.py
# CreateDate: 2022-10-08 11:05:22
# Author: liyalong
# Contact: liyalong16@crpcg.com
# LastModified:
# SoftWare: IDEA
# Description: 抓取全国疫情信息
# Run: 手动执行:python3 national_covid19_epidemic.py
# parameter:
# doris_table_ddl:
# create table ods_ehs_national_covid19_info(
# city varchar(100) comment '城市',
# city_no int comment '城市编码',
# `data_dt` date comment '统计时间',
# province_city_flag int comment '省市标识:1-省,0-市',
# today_local_confirm_add int comment '今日新增确诊',
# today_confirmint int comment '今日本地确诊',
# today_wzz_add int comment '今日无症状新增',
# totle_dead int comment '累计死亡人数',
# totle_heal int comment '累计治愈人数',
# totle_wzz int comment '累计无症状',
# totle_confirm int comment '累计确诊',
# high_risk_areaNum int comment '高风险区域数',
# medium_risk_areaNum int comment '中风险区域数',
# continue_day_zero_local_confirm int comment '清零天数',
# update_time datetime comment '数据更新时间'
# )
# UNIQUE KEY(city,city_no)
# DISTRIBUTED BY HASH(`city_no`) BUCKETS 8 ;
# -------------------------------------------------
'''
class national_covid19_epidemic:
    """Fetch nationwide COVID-19 statistics from the Tencent news API,
    flatten them into per-province / per-city rows, and persist them to a
    local Excel file and/or a Doris (MySQL-protocol) table.
    """

    # The API returns province-level adcodes; the dashboard expects fixed
    # codes for TW/HK/MO and city-level codes for the four municipalities.
    ADCODE_OVERRIDES = {
        "台湾": "710000",
        "澳门": "820000",
        "香港": "810000",
        "北京": "110100",
        "天津": "120100",
        "上海": "310100",
        "重庆": "500100",
    }
    # Municipalities are flagged as cities ("0") rather than provinces ("1").
    MUNICIPALITIES = ("北京", "上海", "天津", "重庆")
    # Pseudo-regions in the API payload that are not real cities.
    EXCLUDED_CITIES = ("地区待确认", "境外输入", "待确认", "外地来津",
                       "外地来沪", "境外来沪", "外地来京", "涉奥闭环人员")

    def __init__(self):
        self.epidemic_data = []  # per-area dicts collected by read_api()
        self.conn = create_engine('mysql+pymysql://用户:密码@host_ip:port/database')
        # FIX: original stored the pd.DataFrame *class* as a placeholder;
        # use an empty instance instead (overwritten by pandas_convert()).
        self.pandas_data = pd.DataFrame()
        self.dt = datetime.datetime.now().strftime("%Y-%m-%d")  # partition date

    def _area_row(self, node, adcode, flag, zero_days):
        """Map one API area node to the dict schema consumed by pandas_convert().

        Key insertion order matters: pandas_convert() renames columns by
        position, so this order must match its English column list.
        """
        return {
            "地区": node["name"],
            "城市编码": adcode,
            '统计时间': self.dt,
            "省市标识": flag,
            '今日新增确诊': node['today']['local_confirm_add'],
            '今日本地确诊': node['today']['confirm'],
            '今日无症状新增': node['today']['wzz_add'],
            '累计死亡人数': node['total']['dead'],
            '累计治愈人数': node['total']['heal'],
            '累计无症状': node['total']['wzz'],
            '累计确诊': node['total']['confirm'],
            '高风险区域数': node['total']['highRiskAreaNum'],
            '中风险区域数': node['total']['mediumRiskAreaNum'],
            '清零天数': zero_days,
            '更新时间': node['total']['mtime'],
        }

    def read_api(self, api_url):
        """Fetch the API payload and flatten province and city rows into
        self.epidemic_data.

        :param api_url: Tencent news COVID-19 endpoint URL.
        """
        # Behind a corporate proxy, route through socks5 instead:
        # proxies = {'https': f'socks5://{username}:{password}@{ip}:{port}'}
        # session = requests.Session(); session.proxies = proxies
        # data = session.get(api_url)
        data = requests.get(api_url, timeout=30)  # FIX: avoid hanging forever
        china_data = data.json()['data']['diseaseh5Shelf']['areaTree'][0]['children']
        print(china_data)
        for child in china_data:
            if child["name"] == "境外输入":
                continue
            adcode = self.ADCODE_OVERRIDES.get(child["name"], child["adcode"])
            flag = "0" if child["name"] in self.MUNICIPALITIES else "1"
            # NOTE: province nodes use key 'continueDayZeroLocalConfirmAdd',
            # city nodes use 'continueDayZeroLocalConfirm' — an API quirk.
            self.epidemic_data.append(self._area_row(
                child, adcode, flag,
                child['total']['continueDayZeroLocalConfirmAdd']))
            for city in child["children"]:
                if city["name"] in self.EXCLUDED_CITIES:
                    continue
                adcode = "110106" if city["name"] == "丰台" else city["adcode"]
                self.epidemic_data.append(self._area_row(
                    city, adcode, "0",
                    city['total']['continueDayZeroLocalConfirm']))

    def pandas_convert(self):
        """Build a DataFrame from the collected rows and rename the columns
        (positionally) to the Doris table's English column names."""
        self.pandas_data = pd.DataFrame(self.epidemic_data)
        self.pandas_data.columns = ['city', 'city_no', 'data_dt', 'province_city_flag',
                                    'today_local_confirm_add', 'today_confirmint',
                                    'today_wzz_add', 'totle_dead', 'totle_heal',
                                    'totle_wzz', 'totle_confirm',
                                    'high_risk_areaNum', 'medium_risk_areaNum',
                                    'continue_day_zero_local_confirm', 'update_time']
        print(self.pandas_data)

    def write_csv(self):
        """Write the DataFrame to a local Excel file, replacing any old export."""
        path = "C:\\Users\\Administrator\\Desktop\\data.xlsx"
        try:
            os.remove(path)
        except FileNotFoundError:
            # FIX: original bare except printed "已删除" on *failure* to
            # delete; a missing file on the first run is simply fine.
            pass
        # FIX: dropped encoding= kwarg (removed from to_excel in pandas 2.0).
        self.pandas_data.to_excel(path, index=False)

    def write_db(self):
        """Append the DataFrame to the ods Doris/MySQL table."""
        # FIX: original referenced self.pands_data (typo -> AttributeError).
        pd.io.sql.to_sql(self.pandas_data, name='ods_ehs_national_covid19_epidemic_info',
                         con=self.conn, schema='ods', index=False, index_label=False,
                         if_exists='append', chunksize=1000)
        print("成功")
if __name__ == '__main__':
    api_url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=statisGradeCityDetail,diseaseh5Shelf"
    # FIX: original rebound the class name to the instance, shadowing the
    # class; use a distinct variable name.
    crawler = national_covid19_epidemic()
    crawler.read_api(api_url)
    crawler.pandas_convert()
    crawler.write_csv()
    # crawler.write_db()  # enable to also load the Doris/MySQL table
2022-11-11 双十一快乐
更新下新代码,新增全国疫情区域明细,当然这个是参考的别的大佬的API。
# -*- coding: utf-8 -*-
# @Time : 2022-10-19 11:05:22
# @Author : liyalong
# @Email : liyalong16@crpcg.com
# @File : 全国疫情风险地区明细.py
# @Software: PyCharm
import hashlib
import requests
import pandas as pd
import time
import json
import socks
import os
from datetime import date, timedelta
import datetime
class national_covid19_city:
    """Fetch the national COVID-19 risk-area detail list from the
    bmfw.www.gov.cn government API and export it to a local Excel file.
    """

    def __init__(self):
        self.epidemic_city_data = []  # flattened risk-area rows
        # FIX: original stored the pd.DataFrame *class*; use an instance.
        self.pandas_data = pd.DataFrame()
        # self.dt = datetime.datetime.now().strftime("%Y-%m-%d")
        # Partition with tomorrow's date: today's run loads tomorrow's partition.
        self.dt = (date.today() + timedelta(days=1)).strftime("%Y-%m-%d")
        print("分区日期是:" + self.dt)
        self.timestamp = str(int(time.time()))  # reused in every signature
        self.req_data = ""      # raw JSON payload, set by get_datas()
        self.update_time = ""   # date part of the payload's end_update_time
        # nationwide totals of high / medium / low risk areas
        self.hcount = 0
        self.mcount = 0
        self.lcount = 0

    def get_zdwwsignature(self):
        """Return the SHA-256 request signature for the x-wif-signature header."""
        zdwwsign = self.timestamp + 'fTN2pfuisxTavbTuYVSsNJHetwq5bJvC' + 'QkjjtiLM2dCratiA' + self.timestamp
        hsobj = hashlib.sha256()
        hsobj.update(zdwwsign.encode('utf-8'))
        return hsobj.hexdigest().upper()

    def get_signatureheader(self):
        """Return the SHA-256 signature for the request body's signatureHeader."""
        token = '23y0ufFl5YxIyGrI8hWRUZmKkvtSjLQA'
        nonce = '123456789abcdefg'
        has256 = hashlib.sha256()
        has256.update((self.timestamp + token + nonce + self.timestamp).encode('utf-8'))
        return has256.hexdigest().upper()

    def get_datas(self):
        """POST the signed request and store the JSON payload in self.req_data."""
        url = 'https://bmfw.www.gov.cn/bjww/interface/interfaceJson'
        headers = {
            "Accept": "application/json, text/javascript, */*; q=0.01",
            "Accept-Encoding": "gzip, deflate, br",
            "Accept-Language": "zh-CN,zh;q=0.9",
            "Connection": "keep-alive",
            "Content-Type": "application/json; charset=UTF-8",
            "Host": "bmfw.www.gov.cn",
            "Origin": "http://bmfw.www.gov.cn",
            "Referer": "http://bmfw.www.gov.cn/yqfxdjcx/risk.html",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0",
            "x-wif-nonce": "QkjjtiLM2dCratiA",
            "x-wif-paasid": "smt-application",
            "x-wif-signature": self.get_zdwwsignature(),
            "x-wif-timestamp": self.timestamp
        }
        params = {
            'appId': "NcApplication",
            'paasHeader': "zdww",
            'timestampHeader': self.timestamp,
            'nonceHeader': "123456789abcdefg",
            'signatureHeader': self.get_signatureheader(),
            'key': "3C502C97ABDA40D0A60FBEE50FAAD1DA"
        }
        resp = requests.post(url, headers=headers, json=params, timeout=30)
        self.req_data = resp.json()
        self.update_time = str(self.req_data["data"]["end_update_time"]).split(" ")[0]

    def _append_rows(self, area_list, risk_level):
        """Append one row per risk area, tagged with risk_level ('3' high,
        '2' medium, '1' low) and the nationwide per-level totals."""
        for area in area_list:
            # FIX: original shadowed the builtin name `dict`.
            self.epidemic_city_data.append({
                "data_dt": self.dt,
                "risk_level": risk_level,
                "province": area["province"],
                "city": area["city"],
                "street": area["county"],
                "high_risk_areaNum": self.hcount,
                "medium_risk_areaNum": self.mcount,
                "low_risk_areaNum": self.lcount,
            })

    def convert_data(self):
        """Flatten the high/middle/low risk lists into rows and build the DataFrame."""
        payload = self.req_data["data"]
        # nationwide totals for each risk level
        self.hcount = payload["hcount"]
        self.mcount = payload["mcount"]
        self.lcount = payload["lcount"]
        self._append_rows(payload["highlist"], "3")
        self._append_rows(payload["middlelist"], "2")
        self._append_rows(payload["lowlist"], "1")
        self.pandas_data = pd.DataFrame(self.epidemic_city_data)

    def write_csv(self):
        """Write the DataFrame to a local Excel file, replacing any old export."""
        path = "C:\\Users\\Administrator\\Desktop\\data_city.xlsx"
        try:
            os.remove(path)
        except FileNotFoundError:
            # FIX: original bare except printed "已删除" on *failure* to
            # delete; a missing file on the first run is simply fine.
            pass
        # FIX: dropped encoding= kwarg (removed from to_excel in pandas 2.0).
        self.pandas_data.to_excel(path, index=False)
if __name__ == '__main__':
    # Build the client, pull the risk-area payload, flatten it, export it.
    city_crawler = national_covid19_city()
    city_crawler.get_datas()
    city_crawler.convert_data()
    city_crawler.write_csv()