最近用Python写了个全国疫情中高风险地区查询的爬虫代码,分享给大家一起交流,希望得到不同思路的指教,让代码更简洁,运行效率更高。
总体思路
1、找到可供查询的源网站
2、分析、获取查询的API
3、构造API请求(请求头、签名及请求参数)
4、获取全国中高风险地区信息
5、对信息按“省/直辖市、市/区、县/街道”进行整理
6、构造查询匹配语句,判断输入的“省/直辖市、市/区、县/街道”是否在获取的信息中
注:输入匹配部分的代码自己总觉得不是最优解,希望能有大佬看到,指点一二。
代码部分
# -*- coding: utf-8 -*-
# @Time : 2022/5/12 11:08
# @Author : Kyln.Wu
# @Email : kylnwu@qq.com
# @File : 疫情风险地区查询.py
# @Software: PyCharm
import hashlib
import json
import difflib
import requests
import time
# Current Unix time in whole seconds, as a string (evaluated once, at import).
timestamp = str(int(time.time()))
# Constants needed to build the request signatures.
# NOTE: `token` is a masked placeholder because the full key could not be
# published in the code itself; the full value given by the author is
# 23y0ufFl5YxIyGrI8hWRUZmKkvtSjLQA
token = '*********************'
nonce = '123456789abcdefg'
passid = 'zdww'
key = "3C502C97ABDA40D0A60FBEE50FAAD1DA"
# Rebuild in Python the value sent as the "x-wif-signature" request header.
def get_zdwwsignature(ts=None):
    """Return the signature for the ``x-wif-signature`` request header.

    The signature is the upper-case SHA-256 hex digest of
    ``ts + secret + nonce + ts``, where the secret and nonce are the fixed
    strings below (the nonce is the same value sent as ``x-wif-nonce``).

    Args:
        ts: Timestamp to sign (str or int). Defaults to the module-level
            ``timestamp``. It must equal the value sent as
            ``x-wif-timestamp`` for the pair to be consistent.

    Returns:
        A 64-character upper-case hexadecimal string.
    """
    ts = str(timestamp if ts is None else ts)
    payload = ts + 'fTN2pfuisxTavbTuYVSsNJHetwq5bJvC' + 'QkjjtiLM2dCratiA' + ts
    return hashlib.sha256(payload.encode('utf-8')).hexdigest().upper()
# Rebuild in Python the value sent as the "signatureHeader" request parameter.
def get_signatureheader(ts=None, app_token=None, app_nonce=None):
    """Return the signature for the ``signatureHeader`` request parameter.

    The signature is the upper-case SHA-256 hex digest of
    ``ts + app_token + app_nonce + ts``.

    Args:
        ts: Timestamp to sign (str or int). Defaults to the module-level
            ``timestamp``.
        app_token: Secret token. Defaults to the module-level ``token``
            (which is a masked placeholder in this file).
        app_nonce: Nonce string. Defaults to the module-level ``nonce``.

    Returns:
        A 64-character upper-case hexadecimal string.
    """
    ts = str(timestamp if ts is None else ts)
    if app_token is None:
        app_token = token
    if app_nonce is None:
        app_nonce = nonce
    payload = ts + app_token + app_nonce + ts
    return hashlib.sha256(payload.encode('utf-8')).hexdigest().upper()
# Main fetch routine: call the API with the required headers and parameters
# to download the nationwide data.
def get_datas():
    """Download the nationwide risk-area data and cache it in ./risk_data.log.

    Sends a signed JSON POST request and writes the raw response text to
    the local cache file, so later queries can run offline and the site is
    hit as rarely as possible (to avoid getting the IP blocked).

    Raises:
        requests.RequestException: on network failure, timeout, or an HTTP
            error status. The cache file is left untouched in that case.
    """
    url = 'https://bmfw.www.gov.cn/bjww/interface/interfaceJson'
    headers = {
        "Accept": "application/json, text/javascript, */*; q=0.01",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        "Content-Type": "application/json; charset=UTF-8",
        "Host": "bmfw.www.gov.cn",
        "Origin": "http://bmfw.www.gov.cn",
        "Referer": "http://bmfw.www.gov.cn/yqfxdjcx/risk.html",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.87 Safari/537.36 SE 2.X MetaSr 1.0",
        "x-wif-nonce": "QkjjtiLM2dCratiA",
        "x-wif-paasid": "smt-application",
        "x-wif-signature": get_zdwwsignature(),
        "x-wif-timestamp": timestamp,
    }
    # Reuse the module-level constants so the values sent here cannot drift
    # from the ones get_signatureheader() signs.
    params = {
        'appId': "NcApplication",
        'paasHeader': passid,
        'timestampHeader': timestamp,
        'nonceHeader': nonce,
        'signatureHeader': get_signatureheader(),
        'key': key,
    }
    # A timeout keeps the script from hanging forever on a stalled connection.
    resp = requests.post(url, headers=headers, json=params, timeout=15)
    # Fail before touching the cache so an error page never overwrites good data.
    resp.raise_for_status()
    datas = resp.text
    with open('./risk_data.log', 'w', encoding='utf-8') as f:
        f.write(datas)
# Extract the "highlist" part of the response, i.e. the high-risk areas.
def get_highlist(data):
    """Return ``data['data']['highlist']`` from the parsed API response."""
    payload = data['data']
    return payload['highlist']
# Extract the "middlelist" part of the response, i.e. the medium-risk areas.
def get_middlelist(data):
    """Return ``data['data']['middlelist']`` from the parsed API response."""
    payload = data['data']
    return payload['middlelist']
# Interactive lookup of medium/high-risk areas.
def _unique_names(records, field):
    """Return the distinct values of *field* in *records*, sorted for stable output."""
    return sorted({rec[field] for rec in records})


def _risk_level(name, high_names, middle_names):
    """Classify *name* as '高', '中' or '低' by fuzzy-matching it against both lists.

    A high-risk match takes precedence, because a region that contains both
    high- and medium-risk areas appears in both lists.
    """
    if difflib.get_close_matches(name, high_names, 1, cutoff=0.6):
        return '高'
    if difflib.get_close_matches(name, middle_names, 1, cutoff=0.6):
        return '中'
    return '低'


def _describe(province_in, city_in, county_in, levels):
    """Build the result line for one query from its three risk levels."""
    rank = {'高': 2, '中': 1, '低': 0}
    p_level, c_level, k_level = levels
    # A contained area can never be riskier than the region that contains it;
    # if it looks that way, the three inputs do not belong together.
    if not rank[p_level] >= rank[c_level] >= rank[k_level]:
        return '不在中高风险列表中,或行政区域不匹配,请检查!!'
    if levels == ('高', '高', '高'):
        return f'{province_in},{city_in},{county_in} 为高风险省/直辖市,市/区,县/街道!!'
    if levels == ('低', '低', '低'):
        return f'{province_in},{city_in},{county_in} 为低风险省/直辖市,市/区,县/街道。'
    return (f'{province_in} 为{p_level}风险省/直辖市,'
            f'{city_in} 为{c_level}风险市/区,'
            f'{county_in} 为{k_level}风险县/街道。')


def _ask_continue(read, write):
    """Ask whether to run another query; re-prompt until the answer is an integer."""
    while True:
        answer = read('是否继续查询?1-继续,0-退出。').strip()
        try:
            return int(answer) != 0
        except ValueError:
            write('输入无效,请输入 1 或 0。')


def chaxun(high_list, middle_list, *, read=input, write=print):
    """Run the interactive risk lookup loop.

    Prints the known high/medium-risk provinces, cities and counties, then
    repeatedly asks for a province, a city and a county and reports the
    risk level of each. Input is matched fuzzily with
    ``difflib.get_close_matches`` (cutoff 0.6). Each of the three levels is
    matched independently against the names collected from the lists, so
    this is a name check, not a lookup of one specific record.

    An empty answer to any of the three prompts prints a notice and ends
    the session. After each query the user chooses whether to continue
    (0 exits, any other integer continues).

    Args:
        high_list: Records with 'province', 'city' and 'county' keys for
            high-risk areas.
        middle_list: Same structure, for medium-risk areas.
        read: Callable taking a prompt and returning the user's answer.
            Defaults to ``input``.
        write: Callable taking one line of output. Defaults to ``print``.
    """
    high_provinces = _unique_names(high_list, 'province')
    high_citys = _unique_names(high_list, 'city')
    high_countys = _unique_names(high_list, 'county')
    write(f'高风险省/直辖市:{high_provinces}')
    write(f'高风险市/区:{high_citys}')
    write(f'高风险县/街道:{high_countys}')
    middle_provinces = _unique_names(middle_list, 'province')
    middle_citys = _unique_names(middle_list, 'city')
    middle_countys = _unique_names(middle_list, 'county')
    write(f'中风险省/直辖市:{middle_provinces}')
    write(f'中风险市/区:{middle_citys}')
    write(f'中风险县/街道:{middle_countys}')

    prompts = (
        ('请输入来自省/直辖市:', '输入省/直辖市不能为空!'),
        ('请输入来自市/区:', '输入市/区不能为空!'),
        ('请输入来自县/街道:', '输入县/街道不能为空!'),
    )
    while True:
        answers = []
        for prompt, empty_notice in prompts:
            text = read(prompt).strip()
            if not text:
                write(empty_notice)
                return
            answers.append(text)
        province_in, city_in, county_in = answers

        levels = (
            _risk_level(province_in, high_provinces, middle_provinces),
            _risk_level(city_in, high_citys, middle_citys),
            _risk_level(county_in, high_countys, middle_countys),
        )
        write(_describe(province_in, city_in, county_in, levels))

        if not _ask_continue(read, write):
            return
if __name__ == '__main__':
    # Queries run against the locally cached response. When the cache does
    # not exist yet (first run), download it once via get_datas() instead of
    # crashing with FileNotFoundError.
    try:
        with open('./risk_data.log', 'r', encoding='utf-8') as f:
            datas_dic = json.load(f)
    except FileNotFoundError:
        get_datas()
        with open('./risk_data.log', 'r', encoding='utf-8') as f:
            datas_dic = json.load(f)
    high_lst = get_highlist(datas_dic)
    middle_lst = get_middlelist(datas_dic)
    chaxun(high_lst, middle_lst)
运行结果
以上是运行结果,输入信息支持模糊查询(基于difflib.get_close_matches,相似度阈值为0.6)。