爬虫实战_全国公共资源

 仅作练习,侵权联系删除

提取的信息包括:标题,时间,正文链接和正文

代码

import httpx
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup
import json

# Session cookies captured from a browser session on deal.ggzy.gov.cn.
# NOTE(review): JSESSIONID is server-issued and will expire; refresh these
# values from the browser devtools if requests start failing.
cookies = {
    'JSESSIONID': '2e1735d2d60d4d4a2fe985c3b278',
    'insert_cookie': '61459989',
}

# Headers imitating the AJAX call made by the site's own deal-list page:
# X-Requested-With marks the request as XHR, and Origin/Referer match the
# page that normally issues it, so the server responds with JSON.
headers = {
    'Accept': 'application/json, text/javascript, */*; q=0.01',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'Origin': 'http://deal.ggzy.gov.cn',
    'Referer': 'http://deal.ggzy.gov.cn/ds/deal/dealList.jsp',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'X-Requested-With': 'XMLHttpRequest',
}

def make_request(client, url, headers, method='GET', data=None):
    """Send an HTTP request, retrying on transient transport errors.

    Args:
        client: An ``httpx.Client`` (or any object exposing ``get``/``post``).
        url: Target URL.
        headers: Headers dict for this request.
        method: ``'GET'`` or ``'POST'`` (case-insensitive).
        data: Form data for POST requests; ignored for GET.

    Returns:
        The response object, after ``raise_for_status()`` has passed.

    Raises:
        ValueError: If ``method`` is neither GET nor POST.
        httpx.HTTPStatusError: On a 4xx/5xx response (not retried, matching
            the original behavior where only transport failures retried).
        RuntimeError: If all retry attempts fail with transport errors.
    """
    method = method.upper()
    if method not in ('GET', 'POST'):
        # Fail fast with a clear message; previously an unsupported method
        # fell through to an UnboundLocalError on `response`.
        raise ValueError(f"Unsupported HTTP method: {method}")

    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            if method == 'GET':
                response = client.get(url, headers=headers)
            else:
                response = client.post(url, headers=headers, data=data)
            response.raise_for_status()
            return response
        except httpx.TransportError:
            # TransportError covers RemoteProtocolError (the only case the
            # original retried) plus timeouts and connection resets — all
            # transient failures the retry loop is meant to absorb.
            print(f"Attempt {attempt} failed, retrying...")
            time.sleep(2)
    raise RuntimeError("Max retries reached. Request failed.")

results = []

# Loop-invariant constants, hoisted out of the page/item loops.
post_url = 'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp'
base_url = 'https://www.ggzy.gov.cn'

# Headers for the detail-page (shtml) requests; a plausible Referer from the
# same site is enough for the server to return the rendered page.
detail_headers = {
    'Referer': 'https://www.ggzy.gov.cn/information/html/a/440000/0203/202408/21/0044970b651e3c1946ce884a41342ac6d9d8.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Pass the `cookies` defined above so the session is authenticated — they
# were previously defined but never attached to the client (dead code).
# NOTE(review): verify=False disables TLS certificate checks; acceptable for
# this practice script, but do not do this against production services.
with httpx.Client(verify=False, timeout=30, cookies=cookies) as client:
    for page in range(1, 3):  # fetch the first two result pages
        # Search form posted by the deal-list page; only PAGENUMBER varies.
        data = {
            'TIMEBEGIN_SHOW': '2024-08-12',
            'TIMEEND_SHOW': '2024-08-21',
            'TIMEBEGIN': '2024-08-12',
            'TIMEEND': '2024-08-21',
            'SOURCE_TYPE': '1',
            'DEAL_TIME': '02',
            'DEAL_CLASSIFY': '00',
            'DEAL_STAGE': '0000',
            'DEAL_PROVINCE': '0',
            'DEAL_CITY': '0',
            'DEAL_PLATFORM': '0',
            'BID_PLATFORM': '0',
            'DEAL_TRADE': '0',
            'isShowAll': '1',
            'PAGENUMBER': page,
            'FINDTXT': '',
        }

        response = make_request(client, post_url, headers, method='POST', data=data)
        items = response.json().get('data', [])

        for item in items:
            title = item['title']
            timeShow = item['timeShow']
            full_url = urljoin(base_url, item['url'])

            # Normalize scheme; count=1 so only the leading "http://" is
            # touched even if the string occurs again later in the URL.
            if full_url.startswith("http://"):
                full_url = full_url.replace("http://", "https://", 1)

            # The '/html/a/...' detail pages come back without content when
            # requested from code; the same path under '/html/b/' carries
            # the actual data (see the analysis below the code).
            if 'https://www.ggzy.gov.cn/information/html/a' in full_url:
                full_url = full_url.replace('https://www.ggzy.gov.cn/information/html/a', 'https://www.ggzy.gov.cn/information/html/b', 1)

            try:
                res = make_request(client, full_url, detail_headers)
                soup = BeautifulSoup(res.text, 'html.parser')

                # Extract all visible text from the detail page.
                all_text = soup.get_text(separator='\n', strip=True)

                results.append({
                    'title': title,
                    'timeShow': timeShow,
                    'url': full_url,
                    'text': all_text
                })

            except Exception as e:
                # Best-effort scraping: log the failure and move on to the
                # next item rather than aborting the whole run.
                print(f"Failed to retrieve {full_url}: {e}")

            time.sleep(2)  # throttle requests to be polite to the server

# Persist everything collected to a JSON file.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)

print("数据已保存到 results.json")

成果展示

分析

请求库:httpx

数据提取:bs4

直接发请求,得到json数据,提取数据

得到标题,时间和链接

对提取到的链接直接发请求是拿不到数据的:在浏览器中虽然能打开页面、页面中也确实有数据,但代码请求得到的响应里却没有正文内容

经过观察

前面提取的链接

https://www.ggzy.gov.cn/information/html/a/320000/0101/202408/19/0032fc7b5bd9728c48cb8a95dc2f52bdf9a5.shtml

 有数据的链接

https://www.ggzy.gov.cn/information/html/b/320000/0101/202408/19/0032fc7b5bd9728c48cb8a95dc2f52bdf9a5.shtml

 其实就是链接中,由a变成了b

在代码中对链接做处理

最后详情页使用bs4提取全部的文本内容

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值