#coding=utf-8
import json

try:  # Python 3.4+: html.unescape replaces the removed HTMLParser.unescape
    from html import unescape
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    unescape = HTMLParser().unescape

import requests
from lxml import etree
from pymongo import MongoClient
# Form fields POSTed to the search action. main() sends a copy of this dict
# with 'currentPage' replaced by the page being fetched.
data = {
    'pageSize': 10,
    'currentPage': 1,
    'fundingProject.projectNo': '',
    'fundingProject.name': '',
    'fundingProject.person': '',
    'fundingProject.org': '',
    'fundingProject.applyCode': '',
    'fundingProject.grantCode': '',
    'fundingProject.subGrantCode': '',
    'fundingProject.helpGrantCode': '',
    'fundingProject.keyword': '',
    'fundingProject.statYear': '',
    'checkCode': '%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81',
}

url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# Request headers sent with every search POST (values kept exactly as in the
# original script, including the fixed session cookie).
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Content-Length': '340',
    'Content-Type': 'application/x-www-form-urlencoded',
    'Cookie': 'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4',
    'Host': 'npd.nsfc.gov.cn',
    'Origin': 'http://npd.nsfc.gov.cn',
    'Referer': 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# (result key, label text searched for in the markup), in the order the
# labels are looked up. Each search starts where the previous value ended.
_FIELD_LABELS = (
    ('standard_no', u'批准号'),               # approval number
    ('standard_type', u'项目类别'),           # project category
    ('supporting_institution', u'依托单位'),  # supporting institution
    ('project_principal', u'项目负责人'),     # project principal
    ('funds', u'资助经费'),                   # funding amount
    ('year', u'批准年度'),                    # approval year
    ('keywords', u'关键词'),                  # keywords
)


def main(first_page=1, last_page=43183):
    """Fetch search-result pages and insert one document per project.

    For every page number from *first_page* to *last_page* (inclusive) the
    page number is printed, the search form is POSTed to ``url``, every
    ``<dl class="time_dl">`` element of the response is serialised,
    entity-unescaped, parsed by :func:`jiexi`, and the resulting dict is
    inserted into the ``science_fund`` collection of the ``ScienceFund``
    database on the local MongoDB server.

    The defaults reproduce the original hard-coded ``range(1, 43184)``.
    """
    client = MongoClient('localhost', 27017)
    db = client.ScienceFund
    # NOTE(review): the credentials are blank in the source, presumably
    # redacted; fill them in before running.
    db.authenticate("", "")
    collection = db.science_fund

    for page in range(first_page, last_page + 1):
        print(page)
        # Build a per-request payload instead of mutating the shared dict.
        payload = dict(data, currentPage=page)
        result = requests.post(url, data=payload, headers=headers)
        tree = etree.HTML(result.text)
        for item in tree.xpath("//dl[@class='time_dl']"):
            markup = etree.tostring(item, method='html')
            if isinstance(markup, bytes):
                # Default serialisation is ASCII bytes with non-ASCII text as
                # character references; decode so unescape() gets text.
                markup = markup.decode('ascii')
            collection.insert(jiexi(unescape(markup)))


def _value_end(content, start):
    """Return the index of the first '<' at or after *start*, or len(content)."""
    end = content.find('<', start)
    return len(content) if end == -1 else end


def jiexi(content):
    """Parse one serialised result entry into a dict of project fields.

    *content* is the (already entity-unescaped) HTML text of a single
    ``<dl class="time_dl">`` element.

    Returns a dict with the keys ``title``, ``standard_no``,
    ``standard_type``, ``supporting_institution``, ``project_principal``,
    ``funds``, ``year`` and ``keywords``. A field whose label cannot be
    found is returned as an empty string.

    NOTE(review): the end-of-value delimiters were empty strings in the
    source, which made every field come out empty. Here each value is taken
    to run up to the next ``<``; confirm this against the live page markup.
    """
    record = {}

    # Title: the text following the first '">' at or after offset 20
    # (presumably chosen to skip the 20-character '<dl class="time_dl">'
    # opening tag, which itself ends in '">').
    anchor_end = content.find('">', 20)
    if anchor_end == -1:
        record['title'] = u''
        cursor = 0
    else:
        title_start = anchor_end + 2
        cursor = _value_end(content, title_start)
        record['title'] = content[title_start:cursor]

    for key, label in _FIELD_LABELS:
        label_pos = content.find(label, cursor)
        if label_pos == -1:
            record[key] = u''
            continue
        # Skip the label plus the single separator character that follows it
        # (the original used label length + 1 for every field).
        value_start = label_pos + len(label) + 1
        cursor = _value_end(content, value_start)
        record[key] = content[value_start:cursor].strip()

    return record


if __name__ == '__main__':
    main()