python爬取基金_Python 爬基金数据

最新推荐文章于 2023-10-06 17:46:42 发布

weixin_39565910

最新推荐文章于 2023-10-06 17:46:42 发布

阅读量297

点赞数

文章标签： python爬取基金

# coding=utf-8
#
# Crawl the NSFC funded-project search results page by page, parse every
# result entry into a flat dict and store it in a local MongoDB collection.

import json

try:  # Python 3
    from html import unescape as _unescape
except ImportError:  # Python 2
    from HTMLParser import HTMLParser
    _unescape = HTMLParser().unescape

# Form fields posted with every search request; only 'currentPage' varies.
data = {'pageSize':10,'currentPage':1,'fundingProject.projectNo':'','fundingProject.name':'','fundingProject.person':'','fundingProject.org':'','fundingProject.applyCode':'','fundingProject.grantCode':'','fundingProject.subGrantCode':'','fundingProject.helpGrantCode':'','fundingProject.keyword':'','fundingProject.statYear':'','checkCode':'%E8%AF%B7%E8%BE%93%E5%85%A5%E9%AA%8C%E8%AF%81%E7%A0%81'}

url = 'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action'

# Request headers sent with every POST (includes a fixed session cookie).
headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8','Accept-Encoding':'gzip, deflate','Accept-Language':'zh-CN,zh;q=0.9','Cache-Control':'max-age=0','Connection':'keep-alive','Content-Length':'340','Content-Type':'application/x-www-form-urlencoded','Cookie':'JSESSIONID=8BD27CE37366ED8022B42BFC68FF82D4','Host':'npd.nsfc.gov.cn','Origin':'http://npd.nsfc.gov.cn','Referer':'http://npd.nsfc.gov.cn/fundingProjectSearchAction!search.action','Upgrade-Insecure-Requests':'1','User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}

# (output key, label searched for in the entry markup), in the order the
# labels are looked up; each search resumes where the previous value ended.
_FIELD_LABELS = (
    ('standard_no', u'批准号'),               # approval number
    ('standard_type', u'项目类别'),            # project category
    ('supporting_institution', u'依托单位'),   # supporting institution
    ('project_principal', u'项目负责人'),       # project principal
    ('funds', u'资助经费'),                    # funding amount
    ('year', u'批准年度'),                     # approval year
    ('keywords', u'关键词'),                   # keywords
)


def main():
    """Fetch every result page, parse each entry and insert it into MongoDB.

    Posts the search form to ``url`` for pages 1..43183, selects every
    ``<dl class="time_dl">`` element of the response, converts it to a dict
    with :func:`jiexi` and inserts that dict into the
    ``ScienceFund.science_fund`` collection on ``localhost:27017``.
    """
    # Imported here rather than at module level so that ``jiexi`` can be
    # imported and exercised without the crawler's third-party packages.
    import requests
    from lxml import etree
    from pymongo import MongoClient

    client = MongoClient('localhost', 27017)
    db = client.ScienceFund
    # NOTE(review): kept exactly as in the original (empty credentials).
    # Database.authenticate and Collection.insert no longer exist in
    # PyMongo 4 -- confirm the installed PyMongo version before running.
    db.authenticate("", "")
    collection = db.science_fund

    for page in range(1, 43184):
        print(page)
        # Build a per-request copy instead of mutating the shared ``data``.
        payload = dict(data, currentPage=page)
        result = requests.post(url, data=payload, headers=headers)
        tree = etree.HTML(result.text)
        if tree is None:
            # Nothing parseable came back for this page; move on.
            continue
        for item in tree.xpath("//dl[@class='time_dl']"):
            content = etree.tostring(item, method='html')
            if isinstance(content, bytes):
                # tostring() yields encoded bytes; unescape needs text.
                content = content.decode('utf-8')
            content = _unescape(content)
            collection.insert(jiexi(content))


def _text_until_tag(content, start):
    """Return ``(text, end)``: the text from *start* up to the next ``<``."""
    end = content.find('<', start)
    if end == -1:
        end = len(content)
    return content[start:end], end


def jiexi(content):
    """Parse the markup of one result entry into a flat dict.

    Args:
        content: unescaped HTML text of a single ``<dl class="time_dl">``
            element.

    Returns:
        A dict with the keys ``title``, ``standard_no``, ``standard_type``,
        ``supporting_institution``, ``project_principal``, ``funds``,
        ``year`` and ``keywords``. A field whose label cannot be found is
        returned as an empty string.
    """
    record = {}

    # Title: the text following the first '">' found from offset 20 onwards.
    # 20 is the length of '<dl class="time_dl">', so the wrapper element's
    # own '">' is skipped.
    # NOTE(review): the end-delimiter literals were empty strings in the
    # source, which made every extracted field empty. Each value is now taken
    # up to the next '<' -- confirm this against the live page markup.
    anchor = content.find('">', 20)
    if anchor == -1:
        title, cursor = u'', 0
    else:
        title, cursor = _text_until_tag(content, anchor + 2)
    record['title'] = title

    for key, label in _FIELD_LABELS:
        hit = content.find(label, cursor)
        if hit == -1:
            record[key] = u''
            continue
        # Skip the label plus the single separator character that follows it.
        value, cursor = _text_until_tag(content, hit + len(label) + 1)
        record[key] = value.strip()

    return record


if __name__ == '__main__':
    main()

weixin_39565910

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫