Scraping the news from a very simple public-information site.
The request method is POST; for the parameters, just fire one request on the site itself and copy them over.
Parsing is done with the BeautifulSoup4 library: look up the relevant tags by their attributes. Dead simple.
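A minimal standalone sketch of that attribute-search pattern, on made-up HTML (the real tag names and attributes used below come from the target site):

from bs4 import BeautifulSoup

html = '<a target="_blank" href="http://example.com/1.html">news</a>'
soup = BeautifulSoup(html, "html.parser")
# find_all matches tags by name plus a dict of required attributes
for a in soup.find_all("a", {"target": "_blank"}):
    print(a.get("href"))  # -> http://example.com/1.html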
from bs4 import BeautifulSoup
import requests

allNewsUrl = []  # detail-page URLs collected from the listing pages
allInfo = []     # parsed (project, winner) pairs


class Info:
    projName = ""     # project name
    companyName = ""  # name of the winning company
# Step 1: fetch all the news links from the listing pages
def get_AllNews(PageIndex):
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Content-Length is omitted on purpose: requests computes it from the body
        'Content-Type': 'application/x-www-form-urlencoded',
        # The cookie changes every day; copy a fresh one from the site
        'Cookie': 'JSESSIONID=F587B93718F44E84DB5C1919DFE9D9CF; JSESSIONID=19DD7636277992512EB1D1EFEAA33FA1',
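        # (Alternatively, a requests.Session() that GETs the search page first
        #  would normally pick up a fresh JSESSIONID by itself; untested here.)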
        'Host': 'www.ccgp-hubei.gov.cn:9040',
        'Origin': 'http://www.ccgp-hubei.gov.cn:9040',
        'Referer': 'http://www.ccgp-hubei.gov.cn:9040/quSer/searchXmgg.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    }
    params = {
        'queryInfo.type': 'xmgg',
        'queryInfo.key': '',
        'queryInfo.jhhh': '',
        'queryInfo.fbr': '',
        'queryInfo.gglx': '中标(成交结果)公告',
        'queryInfo.cglx': '',
        'queryInfo.cgfs': '',
        'queryInfo.city': '孝感市',
        'queryInfo.qybm': '4209??',
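        # ('4209' is the administrative-division code prefix for 孝感市; the
        #  '??' looks like a wildcard over the county-level suffix -- an assumption)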
        'queryInfo.district': '全市',
        'queryInfo.cgr': '',
        'queryInfo.begin': '2017/12/01',
        'queryInfo.end': '2021/06/23',
        'queryInfo.pageNo': PageIndex,
        'queryInfo.pageSize': '1000',
        'queryInfo.pageTotle': '5'
    }
    BaseUrl = 'http://www.ccgp-hubei.gov.cn:9040/quSer/search'
    res = requests.post(url=BaseUrl, data=params, headers=header)
    # Parse the HTML
    soup = BeautifulSoup(res.text, features="html.parser")
    urls = soup.find_all("a", {'target': '_blank'})  # grab the <a> tags
    for url in urls:
        url = url.get('href')
        if url:  # skip anchors that carry no href
            allNewsUrl.append(url)
# Step 2: fetch the award notice behind each link -- this loops 4000+ times
def get_BiddingInfo():
    for url in allNewsUrl:
        res = requests.get(url)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, features="html.parser")
        proj = soup.find_all("span", {'style': 'font-family:微软雅黑;font-weight:normal;font-size:24px'})
        projName = proj[0].string
        company = soup.find_all("p", {
            'style': 'margin-top: 0; margin-bottom:0;text-align:justify;line-height:200%;font-family:微软雅黑;font-size:16px;'})
        for tag in company:
            text = tag.find_all("span")
            if len(text) != 0 and text[0].string == "供应商名称:":
                companyName = text[1].string
                info = Info()
                info.companyName = companyName
                info.projName = projName
                allInfo.append(info)
# The site seems to cap how many rows one page can return, so a single
# request can't fetch everything -- pull the data in five passes
for index in range(1, 6):
    get_AllNews(index)
get_BiddingInfo()
with open("result.txt", "w", encoding="utf-8") as file:
    for info in allInfo:
        file.write("项目名字:" + info.projName + "\n供应商名字:" + info.companyName + "\n")
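One fragility worth noting: BeautifulSoup's .string returns None whenever a tag has more than one child, so the concatenation above can raise a TypeError on an oddly formatted notice. A minimal guard, using a hypothetical text_of helper (not part of the original script):

from bs4 import BeautifulSoup

def text_of(tag):
    # Fall back to get_text() so a missing .string never yields None
    if tag is None:
        return ""
    return tag.string.strip() if tag.string else tag.get_text(strip=True)

# <span>A<b>B</b></span> has two children, so .string is None,
# but get_text() still recovers the full text "AB"
soup = BeautifulSoup("<span>A<b>B</b></span>", "html.parser")
print(soup.span.string)    # None
print(text_of(soup.span))  # AB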
Results