Python批量爬取某单位的化妆品生产许可证信息
某单位主页地址
旧 http://125.35.6.84:81/xk/
新 http://scxk.nmpa.gov.cn:81/xk/
主页列表,为Ajax动态加载形式,如下图:
企业页面,也是Ajax动态加载形式,如下图:
以下为代码:
import requests
import json
import time
def get_datas(id="1246978d50094d849fc45defd4d93419",  # noqa: A002 - public parameter name kept for callers
              save_path=r"C:\Users\Jac\Desktop\动态刷新.json",
              delay=0.5,
              timeout=10):
    """Fetch one company's detail record and append it to a local file.

    Sends a POST request to the ``getXkzsById`` endpoint with the given
    company id, decodes the JSON response, and appends it to ``save_path``
    as a single line (JSON Lines), so that the file stays parseable no
    matter how many records are appended.

    Args:
        id: Company id as returned by the list endpoint.
        save_path: File the record is appended to (UTF-8).
        delay: Seconds to sleep after the request as a simple anti-scraping
            throttle; pass 0 to disable.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById"
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.49"
    }
    response = requests.post(url=url, data={"id": id}, headers=headers,
                             timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    record = response.json()  # decoded JSON body

    with open(save_path, "a", encoding="utf-8") as fp:
        json.dump(record, fp=fp, ensure_ascii=False)
        # Without a separator, appended objects run together ("{...}{...}")
        # and the file cannot be parsed back; one record per line can.
        fp.write("\n")
    print(f"正在写入ID: {id}...")

    if delay:
        time.sleep(delay)  # throttle so the server does not cut us off mid-run
    return
def get_ids(total_pages=366, page_size=15, delay=0.5, timeout=10):
    """Walk the paginated company list and fetch details for every company.

    For each page from 1 to ``total_pages`` this POSTs to the
    ``getXkzsList`` endpoint, collects the ``ID`` of every entry under the
    response's ``list`` key, and calls ``get_datas`` once per id.

    Args:
        total_pages: Number of list pages to request (366 by default).
        page_size: Entries requested per page.
        delay: Seconds to sleep after each page as a simple anti-scraping
            throttle; pass 0 to disable.
        timeout: Seconds to wait for the server before giving up.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    # These do not change between pages, so build them once.
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    headers = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.49"
    }

    for page in range(1, total_pages + 1):
        payload = {
            "on": "true",
            "page": page,
            "pageSize": page_size,
            "productName": "",
            "conditionType": 1,
            "applyname": "",
        }
        response = requests.post(url=url, data=payload, headers=headers,
                                 timeout=timeout)
        # Fail loudly on an HTTP error instead of trying to decode an error page.
        response.raise_for_status()
        listing = response.json()  # decoded JSON body

        # Company ids on this page.
        company_ids = [entry["ID"] for entry in listing["list"]]
        for company_id in company_ids:
            get_datas(company_id)
        print(f"-----第 {page} 页-----已获取-----")

        if delay:
            time.sleep(delay)  # throttle so the server does not cut us off mid-run
    return
if __name__ == "__main__":
    # Script entry point: run the full crawl and report how long it took.
    # perf_counter is monotonic, so the elapsed time is not affected by
    # system clock adjustments during a long-running crawl.
    start_time = time.perf_counter()
    get_ids()
    elapsed = time.perf_counter() - start_time
    print(f"耗时 {elapsed:.2f} s.")
    print("End.")
注意事项
一定要加 time.sleep(0.5) 延时,或其他反爬措施,否则程序会中途报错!
本文结束,谢谢!(不保留版权)