药监局爬取
import requests
from lxml import etree
import os
import json
import time
def get_page(url, params, timeout=10):
    """POST to *url* and return the response only when the status is 200.

    Args:
        url: Target endpoint.
        params: Mapping forwarded through requests' ``params=`` argument,
            i.e. encoded into the URL query string (not the request body).
        timeout: Seconds to wait for the server before requests raises a
            timeout error. Without a timeout a stalled server would block
            the script forever.

    Returns:
        The ``requests.Response`` when the HTTP status code is 200,
        otherwise ``None``.
    """
    # A browser-like User-Agent is sent with every request.
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    }
    response = requests.post(
        url=url,
        headers=headers,
        params=params,
        timeout=timeout,
    )
    if response.status_code == 200:
        return response
    return None
def main():
    """Collect record IDs from the list endpoint, then fetch each record's details.

    Step 1 posts to ``getXkzsList`` page by page and gathers the ``ID`` field
    of every entry under the ``list`` key. Step 2 posts each ID to
    ``getXkzsById`` and keeps the ``businessPerson`` and ``epsAddress``
    fields. The ID list and the final list of result dicts are printed.

    ``get_page`` returns ``None`` for a non-200 response; such pages/records
    are reported and skipped instead of crashing on ``None.json()``.
    """
    url = "http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList"
    id_list = []
    # range(1, 2) requests page 1 only; widen the range to crawl more pages.
    for page in range(1, 2):
        params = {
            'on': 'true',
            'page': str(page),
            'pageSize': '15',
            'productName': '',
            'conditionType': '1',
            'applyname': '',
            'applysn': ''
        }
        response = get_page(url=url, params=params)
        time.sleep(0.5)  # pause between list requests
        if response is None:
            print("list page {} request failed, skipped".format(page))
            continue
        id_list.extend(entry['ID'] for entry in response.json()['list'])
    print(id_list)

    info_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
    information = []
    for num, record_id in enumerate(id_list):
        # Progress message once per 15 IDs, matching the pageSize used above.
        if num % 15 == 0:
            print("爬取第{}页".format(num//15 + 1))
        response = get_page(url=info_url, params={'id': record_id})
        time.sleep(0.1)  # pause between detail requests
        if response is None:
            print("detail request for id {} failed, skipped".format(record_id))
            continue
        info = response.json()
        information.append({
            '负责人': info['businessPerson'],  # person in charge
            '地址': info['epsAddress'],  # address
        })
    print(information)
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
问题
- 爬取过程中一开始使用的是 GET 请求,但请求到页面后,发现其中没有中间的主体信息,从而判定这部分内容是通过动态请求加载的;在该动态请求的 response 里确实包含中间的主体信息。
找准url,判断是Post请求还是Get请求