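# Purpose: extract PPP-related data fields published by the Ministry of Finance.
# Flow: collect the project links from a listing page, extract the fields of each linked project, then move on to the next page; the "procurement method for social capital" field is extracted from a separate link.
# Note: dates show up in Excel as text strings and must be converted to real dates, e.g. by pasting them through Notepad or by using the DATEVALUE function.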
import datetime,time,random,os
import requests,re
from bs4 import BeautifulSoup
import pandas as pd
# Timestamp captured at this point of the script.
starttime = datetime.datetime.now()

# Listing endpoint, used to find each project's PROJ_ID.
startPage = "http://www.cpppc.org:8086/pppcentral/map/getPPPList.do"

# Project detail endpoint; a PROJ_ID is appended to it to form each project's link.
baseUrl = 'http://www.cpppc.org:8083/efmisweb/ppp/projectLibrary/getProjInfo.do?projId='

# A second link, from which the "procurement method for social capital" data is extracted.
baseUrl2 = 'http://www.cpppc.org:8083/efmisweb/ppp/projectLibrary/getProjInfoNational.do?projId='

# Request parameters used for paging; other options can be added here as well.
myParams = {
    'queryPage': '1',
    'projStateType': '0',
}
myHeader={
'Accept':'application/json, text/javascript, */*; q=0.01',
'Accept-Encoding':'gzip, deflate',
'Accept-Language':'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
'Connection':'keep-alive',
'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
'Cookie':'JSESSIONID=7A442574FB143BC29453ECFA1A10CCC2; _site_id_cookie=82; clientlanguage=zh_CN; faspjsessionid=201xEsLqLL6_eaSNWmga7x05Z-tO19aSbqBEJZgBol-6t836nruw!1514000050; JSESSIONID=IF12fM4-OOkAeZCsfI-E3ErRwaXDf9AAuuANnExxdWHvMxHtrAo5!-549990464',
'Host':'www.cpppc.org:8086',
'Origin':'http://www.cpppc.org:8086',
'Referer':'http://www.cpppc.org:8086/pppcentral/map/toPPPList.do',
'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36',
'X-Requested-W
爬虫:财政部PPP项目库
最新推荐文章于 2024-11-15 14:43:32 发布