Scraping the news from a very simple public-information site.
The request method is POST; for the parameters, just fire one request on the site itself and copy them over.
Parsing is done with the BeautifulSoup4 library: look up the relevant tags by their attributes. Dead simple.
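A minimal standalone sketch of that attribute-search pattern, on made-up HTML (the real tag names and attributes used below come from the target site):

from bs4 import BeautifulSoup

html = '<a target="_blank" href="http://example.com/1.html">news</a>'
soup = BeautifulSoup(html, "html.parser")
# find_all matches tags by name plus a dict of required attributes
for a in soup.find_all("a", {"target": "_blank"}):
    print(a.get("href"))  # -> http://example.com/1.html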
from bs4 import BeautifulSoup
import requests

allNewsUrl = []  # detail-page URLs collected from the listing pages
allInfo = []     # parsed (project, winner) pairs


class Info:
    projName = ""     # project name
    companyName = ""  # name of the winning company
# Step 1: fetch all the news links from the listing pages
def get_AllNews(PageIndex):
    header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,'
                  'image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        # Content-Length is omitted on purpose: requests computes it from the body
        'Content-Type': 'application/x-www-form-urlencoded',
        # The cookie changes every day; copy a fresh one from the site
        'Cookie': 'JSESSIONID=F587B93718F44E84DB5C1919DFE9D9CF; JSESSIONID=19DD7636277992512EB1D1EFEAA33FA1',
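        # (Alternatively, a requests.Session() that GETs the search page first
        #  would normally pick up a fresh JSESSIONID by itself; untested here.)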
        'Host': 'www.ccgp-hubei.gov.cn:9040',
        'Origin': 'http://www.ccgp-hubei.gov.cn:9040',
        'Referer': 'http://www.ccgp-hubei.gov.cn:9040/quSer/searchXmgg.html',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/537.36 (KHTML, '
                      'like Gecko) Chrome/91.0.4472.101 Safari/537.36'
    }
    params = {
        'queryInfo.type': 'xmgg',
        'queryInfo.key': '',
        'queryInfo.jhhh': '',
        'queryInfo.fbr': '',
        'queryInfo.gglx': '中标(成交结果)公告',
        'queryInfo.cglx': '',
        'queryInfo.cgfs': '',
        'queryInfo.city': '孝感市',
        'queryInfo.qybm': '4209??',
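        # ('4209' is the administrative-division code prefix for 孝感市; the
        #  '??' looks like a wildcard over the county-level suffix -- an assumption)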
        'queryInfo.district': '全市',
        'queryInfo.cgr': '',
        'queryInfo.begin': '2017/12/01',
        'queryInfo.end': '2021/06/23',
        'queryInfo.pageNo': PageIndex,
        'queryInfo.pageSize': '1000',
        'queryInfo.pageTotle': '5'
    }
    BaseUrl = 'http://www.ccgp-hubei.gov.cn:9040/quSer/search'
    res = requests.post(url=BaseUrl, data=params, headers=header)
    # Parse the HTML
    soup = BeautifulSoup(res.text, features="html.parser")
    urls = soup.find_all("a", {'target': '_blank'})  # grab the <a> tags
    for url in urls:
        url = url.get('href')
        if url:  # skip anchors that carry no href
            allNewsUrl.append(url)
# Step 2: fetch the award notice behind each link -- this loops 4000+ times
def get_BiddingInfo():
    for url in allNewsUrl:
        res = requests.get(url)
        res.encoding = "utf-8"
        soup = BeautifulSoup(res.text, features="html.parser")
        proj = soup.find_all("span", {'style': 'font-family:微软雅黑;font-weight:normal;font-size:24px'})
        projName = proj[0].string
        company = soup.find_all("p", {
            'style': 'margin-top: 0; margin-bottom:0;text-align:justify;line-height:200%;font-family:微软雅黑;font-size:16px;'})
        for tag in company:
            text = tag.find_all("span")
            if len(text) != 0 and text[0].string == "供应商名称:":
                companyName = text[1].string
                info = Info()
                info.companyName = companyName
                info.projName = projName
                allInfo.append(info)
# The site seems to cap how many rows one page can return, so a single
# request can't fetch everything -- pull the data in five passes
for index in range(1, 6):
    get_AllNews(index)
get_BiddingInfo()
with open("result.txt", "w", encoding="utf-8") as file:
    for info in allInfo:
        file.write("项目名字:" + info.projName + "\n供应商名字:" + info.companyName + "\n")
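One fragility worth noting: BeautifulSoup's .string returns None whenever a tag has more than one child, so the concatenation above can raise a TypeError on an oddly formatted notice. A minimal guard, using a hypothetical text_of helper (not part of the original script):

from bs4 import BeautifulSoup

def text_of(tag):
    # Fall back to get_text() so a missing .string never yields None
    if tag is None:
        return ""
    return tag.string.strip() if tag.string else tag.get_text(strip=True)

# <span>A<b>B</b></span> has two children, so .string is None,
# but get_text() still recovers the full text "AB"
soup = BeautifulSoup("<span>A<b>B</b></span>", "html.parser")
print(soup.span.string)    # None
print(text_of(soup.span))  # AB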
Results