爬虫实战_全国公共资源

 仅作练习,侵权联系删除

提取的信息包括:标题,时间,正文链接和正文

代码

import httpx
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup
import json

# Session cookies captured from a browser session on deal.ggzy.gov.cn.
# NOTE(review): JSESSIONID is server-issued and will expire; refresh these
# values from the browser devtools if requests start failing.
cookies = {
    'JSESSIONID': '2e1735d2d60d4d4a2fe985c3b278',
    'insert_cookie': '61459989',
}

# Headers imitating the AJAX call made by the site's own deal-list page:
# X-Requested-With marks the request as XHR, and Origin/Referer match the
# page that normally issues it, so the server responds with JSON.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'http://deal.ggzy.gov.cn',
    'Referer': 'http://deal.ggzy.gov.cn/ds/deal/dealList.jsp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def make_request(client, url, headers, method='GET', data=None):
    """Send an HTTP request, retrying on transient transport errors.

    Args:
        client: An ``httpx.Client`` (or any object exposing ``get``/``post``).
        url: Target URL.
        headers: Headers dict for this request.
        method: ``'GET'`` or ``'POST'`` (case-insensitive).
        data: Form data for POST requests; ignored for GET.

    Returns:
        The response object, after ``raise_for_status()`` has passed.

    Raises:
        ValueError: If ``method`` is neither GET nor POST.
        httpx.HTTPStatusError: On a 4xx/5xx response (not retried, matching
            the original behavior where only transport failures retried).
        RuntimeError: If all retry attempts fail with transport errors.
    """
    method = method.upper()
    if method not in ('GET', 'POST'):
        # Fail fast with a clear message; previously an unsupported method
        # fell through to an UnboundLocalError on `response`.
        raise ValueError(f"Unsupported HTTP method: {method}")

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            if method == 'GET':
                response = client.get(url, headers=headers)
            else:
                response = client.post(url, headers=headers, data=data)
            response.raise_for_status()
            return response
        except httpx.TransportError:
            # TransportError covers RemoteProtocolError (the only case the
            # original retried) plus timeouts and connection resets — all
            # transient failures the retry loop is meant to absorb.
            print(f"Attempt {attempt} failed, retrying...")
            time.sleep(2)
    raise RuntimeError("Max retries reached. Request failed.")

results = []

# Loop-invariant constants, hoisted out of the page/item loops.
post_url = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp'
base_url = 'https://www.ggzy.gov.cn'

# Headers for the detail-page (shtml) requests; a plausible Referer from the
# same site is enough for the server to return the rendered page.
detail_headers = {
    'Referer': 'https://www.ggzy.gov.cn/information/html/a/440000/0203/202408/21/0044970b651e3c1946ce884a41342ac6d9d8.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Pass the `cookies` defined above so the session is authenticated — they
# were previously defined but never attached to the client (dead code).
# NOTE(review): verify=False disables TLS certificate checks; acceptable for
# this practice script, but do not do this against production services.
with httpx.Client(verify=False, timeout=30, cookies=cookies) as client:
    for page in range(1, 3):  # fetch the first two result pages
        # Search form posted by the deal-list page; only PAGENUMBER varies.
        data = {
            'TIMEBEGIN_SHOW': '2024-08-12',
            'TIMEEND_SHOW': '2024-08-21',
            'TIMEBEGIN': '2024-08-12',
            'TIMEEND': '2024-08-21',
            'SOURCE_TYPE': '1',
            'DEAL_TIME': '02',
            'DEAL_CLASSIFY': '00',
            'DEAL_STAGE': '0000',
            'DEAL_PROVINCE': '0',
            'DEAL_CITY': '0',
            'DEAL_PLATFORM': '0',
            'BID_PLATFORM': '0',
            'DEAL_TRADE': '0',
            'isShowAll': '1',
            'PAGENUMBER': page,
            'FINDTXT': '',
        }

        response = make_request(client, post_url, headers, method='POST', data=data)
        items = response.json().get('data', [])

        for item in items:
            title = item['title']
            timeShow = item['timeShow']
            full_url = urljoin(base_url, item['url'])

            # Normalize scheme; count=1 so only the leading "http://" is
            # touched even if the string occurs again later in the URL.
            if full_url.startswith("http://"):
                full_url = full_url.replace("http://", "https://", 1)

            # The '/html/a/...' detail pages come back without content when
            # requested from code; the same path under '/html/b/' carries
            # the actual data (see the analysis below the code).
            if 'https://www.ggzy.gov.cn/information/html/a' in full_url:
                full_url = full_url.replace('https://www.ggzy.gov.cn/information/html/a', 'https://www.ggzy.gov.cn/information/html/b', 1)

            try:
                res = make_request(client, full_url, detail_headers)
                soup = BeautifulSoup(res.text, 'html.parser')

                # Extract all visible text from the detail page.
                all_text = soup.get_text(separator='\n', strip=True)

                results.append({
                    'title': title,
                    'timeShow': timeShow,
                    'url': full_url,
                    'text': all_text
                })

            except Exception as e:
                # Best-effort scraping: log the failure and move on to the
                # next item rather than aborting the whole run.
                print(f"Failed to retrieve {full_url}: {e}")

            time.sleep(2)  # throttle requests to be polite to the server

# Persist everything collected to a JSON file.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("数据已保存到 results.json")

成果展示

分析

请求库:httpx

数据提取:bs4

直接发请求,得到json数据,提取数据

得到标题,时间和链接

对提取到的链接直接发请求是拿不到数据的:在浏览器中虽然能打开页面、页面中也确实有数据,但代码请求得到的响应里却没有正文内容

经过观察

前面提取的链接

https://www.ggzy.gov.cn/information/html/a/320000/0101/202408/19/0032fc7b5bd9728c48cb8a95dc2f52bdf9a5.shtml

 有数据的链接

https://www.ggzy.gov.cn/information/html/b/320000/0101/202408/19/0032fc7b5bd9728c48cb8a95dc2f52bdf9a5.shtml

 其实就是链接中,由a变成了b

在代码中对链接做处理

最后详情页使用bs4提取全部的文本内容

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值