# Crawl the detail page linked from the first (list) page. Data on both pages
# is Ajax-loaded, so the same technique as the Baidu Translate crawler applies:
# capture the real request with devtools (Network -> XHR) and copy that url;
# post_url is obtained the same way.
import json
import requests
if __name__ == '__main__':
    # Scrape NMPA (drug administration) licensing data: first collect company
    # ids from the paginated list endpoint, then fetch each company's detail
    # record. Both endpoints are Ajax/XHR POST APIs found via packet capture.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 '
                      'Safari/537.36 Core/1.70.3877.400 QQBrowser/10.8.4506.400 '
    }
    id_list = []        # company ids collected from the list endpoint
    all_data_list = []  # detail records, persisted to disk at the end

    # Step 1: fetch the company list page by page.
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
    for page in range(1, 6):  # pages 1 through 5 (range end is exclusive)
        data = {
            'on': 'true',
            'page': page,
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': '',
        }
        # timeout keeps the script from hanging forever on a stalled server
        json_ids = requests.post(url=url, headers=headers, data=data, timeout=10).json()
        for dic in json_ids['list']:
            id_list.append(dic['ID'])

    # Step 2: fetch the detail record for each company id.
    post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    for company_id in id_list:  # renamed from `id` to avoid shadowing the builtin
        data = {
            'id': company_id
        }
        detail_json = requests.post(url=post_url, headers=headers, data=data, timeout=10).json()
        print(detail_json)
        all_data_list.append(detail_json)

    # Step 3: persist all detail records; `with` guarantees the file handle is
    # closed (the original commented-out version never closed it).
    with open('./药监局.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print('over!!!')