import requests
import json
def get_company_detail(id, timeout=10):
    """Fetch the detail record for one licence entry and return the parsed JSON.

    Sends a POST request to the ``getXkzsById`` endpoint with ``id`` as a
    form field, prints the decoded JSON body, and returns it.

    Args:
        id: Identifier of the entry to look up (one of the values produced
            by ``get_id``). The name shadows the builtin but is kept so that
            existing keyword callers continue to work.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
    """
    url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    form_data = {
        "id": id,
    }
    response = requests.post(url=url, headers=headers, data=form_data, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    # Decode the body once and reuse it for both the log line and the result.
    detail = response.json()
    print(detail)
    return detail
def get_id(page, timeout=10):
    """Return the ``ID`` of every entry on one page of the licence list.

    Sends a POST request to the ``getXkzsList`` endpoint asking for ``page``
    with a page size of 15, then collects the ``"ID"`` field of each item in
    the ``"list"`` array of the JSON response.

    Args:
        page: Page number to request.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled
            connection.

    Returns:
        A list with the ``"ID"`` value of each entry, in response order.

    Raises:
        requests.exceptions.HTTPError: If the server answers with a 4xx/5xx
            status.
        requests.exceptions.Timeout: If the server does not answer within
            ``timeout`` seconds.
        KeyError: If the response has no ``"list"`` key or an entry has no
            ``"ID"`` key.
    """
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    }
    params = {
        'on': 'true',
        'page': page,
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    # NOTE(review): the fields are sent in the query string (params=) while
    # get_company_detail sends its field in the form body (data=); kept as-is
    # to preserve the request shape -- confirm which one the server expects.
    response = requests.post(url=url, headers=headers, params=params, timeout=timeout)
    # Fail loudly on an HTTP error instead of trying to decode an error page.
    response.raise_for_status()
    return [entry["ID"] for entry in response.json()['list']]
if __name__ == '__main__':
    # Gather the detail record of every entry listed on pages 1 through 9.
    all_data_list = []
    for page in range(1, 10):
        # Named company_id rather than id so the builtin id() is not
        # rebound at module scope.
        for company_id in get_id(page):
            all_data_list.append(get_company_detail(company_id))

    # ensure_ascii=False writes non-ASCII text as-is instead of \uXXXX escapes.
    with open('./ch5.json', 'w', encoding='utf-8') as fp:
        json.dump(all_data_list, fp=fp, ensure_ascii=False)
    print("success")
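# Summary: the IDs on the home page are generated dynamically and are obtained
# with a POST request; each ID is then used in a second POST request to fetch
# the detailed information from the detail page.
# Key point: use the URL seen in the captured network traffic, not the URL
# shown in the browser's address bar.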