需要爬取如下网站:https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list
目的是在该网站上进行搜索。但是这个网页是通过 ajax 动态加载的,需要 cookies,并且 post 的内容中含有验证码,而验证码每秒更新。请问如何爬取这样的网页?
在这里提供一个查询示例:
项目代码:F030203
资助类别:面上项目
批准年度:2017
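post的网页:https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list?flag=grid&checkcode=
post的数据:表单字段为 _search、nd、rows、page、sidx、sord 和 searchString,其中查询条件和验证码(checkcode)都编码在 searchString 中。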
源代码
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2018/4/15 18:12
import requests,json,zlib,gzip,re
# File holding the text of a curl command for one search request.  The
# script lifts two things from it: the ``sessionidindex`` cookie value and
# the url-encoded POST body passed via ``--data '...'``.
CURL_FILE = 'curl.txt'

url = 'https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list?flag=grid&checkcode='

# Accept-Encoding and Cookie are deliberately not set here: cookies are sent
# through the ``cookies`` dict below, and the encoding header is left to
# requests' defaults.
headers = {
    'Origin': 'https://isisn.nsfc.gov.cn',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Accept': 'application/xml, text/xml, */*; q=0.01',
    'Referer': 'https://isisn.nsfc.gov.cn/egrantindex/funcindex/prjsearch-list',
    'X-Requested-With': 'XMLHttpRequest',
    'Connection': 'keep-alive',
}

# Cookies sent with every request.  ``sessionidindex`` is only a placeholder
# here; main() overwrites it with the value found in CURL_FILE.
# NOTE(review): the remaining values are hard-coded from an earlier browser
# session and are presumably stale -- confirm whether the server needs
# fresh ones.
cookies = {
    'sessionidindex': 'Nhd1hT2D2bLsDX0fbYPH6gGbpNvFGhG177Dr3BksGFj1MB11czXc!-877234612!180665615',
    'test': '69345741',
    'isisn': '98184645',
    'org.springframework.web.servlet.i18n.CookieLocaleResolver.LOCALE': 'zh_CN',
    'JSESSIONID': 'Zd1uNLn4tg6QFEWhXZ6Hc8e0ldqtAwWS0NN5mmerlfSyLVoYJe5T!1578882446',
}

# The cookie value ends at ';' or at the quote closing the header, so the
# cookie is found even when it is the last one in the Cookie header.
_SESSION_COOKIE_RE = re.compile(r'''sessionidindex=([^;'"]+)''')
_POST_DATA_RE = re.compile(r"--data '(.*?)'")
# The search string is url-encoded twice, so "year:<yyyy>" appears as
# "year%253A<yyyy>".  Matching any digits (not just 2005) makes the
# substitution work whatever year the captured request used.
_YEAR_FIELD_RE = re.compile(r'year%253A\d*')

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30


def parse_curl(para):
    """Extract the session cookie and the POST body from a curl command.

    :param para: text of the curl command line.
    :returns: tuple ``(sessionidindex_value, post_body)``.
    :raises ValueError: if either piece cannot be found in ``para``.
    """
    cookie_match = _SESSION_COOKIE_RE.search(para)
    if cookie_match is None:
        raise ValueError('no sessionidindex cookie found in curl command')
    data_match = _POST_DATA_RE.search(para)
    if data_match is None:
        raise ValueError("no --data '...' payload found in curl command")
    return cookie_match.group(1), data_match.group(1)


def body_for_year(data, year):
    """Return ``data`` with the year field of the search string set to ``year``.

    The replacement is done directly by regex rather than via
    ``str.format`` so that literal braces in the body cannot break it.

    :param data: url-encoded POST body taken from the curl command.
    :param year: year to search for.
    """
    return _YEAR_FIELD_RE.sub('year%253A{}'.format(year), data)


def main():
    """Replay the captured search request once per year and print each reply."""
    with open(CURL_FILE) as f:
        para = f.read()

    session_id, data = parse_curl(para)
    cookies['sessionidindex'] = session_id
    print(cookies['sessionidindex'])
    print(data)

    # Send everything through one session so the cookies actually reach the
    # server (the server requires them) and any cookies it sets in a reply
    # are carried over to the following requests.
    with requests.Session() as s:
        s.headers.update(headers)
        s.cookies.update(cookies)
        for year in range(2005, 2017):
            r = s.post(url, data=body_for_year(data, year),
                       timeout=REQUEST_TIMEOUT)
            print(r.text)


if __name__ == '__main__':
    main()