仅作练习,如有侵权请联系删除
提取的信息包括:标题,时间,正文链接和正文
代码
import httpx
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup
import json
# Session cookies copied from a browser session.
# NOTE(review): `cookies` is defined but never passed to the httpx.Client
# created below, so these values are never sent — confirm whether the site
# actually requires the JSESSIONID, or delete this dict.
cookies = {
'JSESSIONID': '2e1735d2d60d4d4a2fe985c3b278',
'insert_cookie': '61459989',
}
# Request headers mimicking the browser's AJAX POST to the deal-list
# endpoint; X-Requested-With marks the request as XHR so the server
# returns JSON instead of a rendered page.
headers = {
'Accept': 'application/json, text/javascript, */*; q=0.01',
'Accept-Language': 'zh-CN,zh;q=0.9',
'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
'Origin': 'http://deal.ggzy.gov.cn',
'Referer': 'http://deal.ggzy.gov.cn/ds/deal/dealList.jsp',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
'X-Requested-With': 'XMLHttpRequest',
}
def make_request(client, url, headers, method='GET', data=None):
    """Issue an HTTP request with a simple retry on dropped connections.

    Args:
        client: httpx.Client used to send the request.
        url: target URL.
        headers: dict of request headers.
        method: 'GET' or 'POST' (exact, case-sensitive — matches existing callers).
        data: form payload for POST requests; ignored for GET.

    Returns:
        The httpx.Response on success (2xx/3xx status).

    Raises:
        ValueError: if ``method`` is neither 'GET' nor 'POST'.
        httpx.HTTPStatusError: if the server returns an error status
            (raised by ``raise_for_status`` and not retried).
        RuntimeError: if all retries are exhausted.
    """
    if method not in ('GET', 'POST'):
        # Fail fast: the original code left ``response`` unbound for any
        # other method, crashing with UnboundLocalError inside the try.
        raise ValueError(f"Unsupported HTTP method: {method!r}")
    max_retries = 3
    for attempt in range(1, max_retries + 1):
        try:
            if method == 'GET':
                response = client.get(url, headers=headers)
            else:
                response = client.post(url, headers=headers, data=data)
            response.raise_for_status()
            return response
        except httpx.RemoteProtocolError:
            # Server dropped the connection mid-response; back off and retry.
            print(f"Attempt {attempt} failed, retrying...")
            time.sleep(2)
    # RuntimeError is an Exception subclass, so existing broad handlers
    # (``except Exception``) still catch it.
    raise RuntimeError("Max retries reached. Request failed.")
# Scrape the first two pages of the deal list, fetch each detail page,
# and dump title/time/url/full-page text to results.json.
results = []

# Headers for the detail-page GETs (a plain browser navigation, not XHR).
detail_headers = {
    'Referer': 'https://www.ggzy.gov.cn/information/html/a/440000/0203/202408/21/0044970b651e3c1946ce884a41342ac6d9d8.shtml',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

with httpx.Client(verify=False, timeout=30) as client:
    for page in range(1, 3):  # pages 1 and 2
        form = {
            'TIMEBEGIN_SHOW': '2024-08-12',
            'TIMEEND_SHOW': '2024-08-21',
            'TIMEBEGIN': '2024-08-12',
            'TIMEEND': '2024-08-21',
            'SOURCE_TYPE': '1',
            'DEAL_TIME': '02',
            'DEAL_CLASSIFY': '00',
            'DEAL_STAGE': '0000',
            'DEAL_PROVINCE': '0',
            'DEAL_CITY': '0',
            'DEAL_PLATFORM': '0',
            'BID_PLATFORM': '0',
            'DEAL_TRADE': '0',
            'isShowAll': '1',
            'PAGENUMBER': page,
            'FINDTXT': '',
        }
        listing = make_request(
            client,
            'http://deal.ggzy.gov.cn/ds/deal/dealList_find.jsp',
            headers,
            method='POST',
            data=form,
        )
        for entry in listing.json().get('data', []):
            title = entry['title']
            shown_time = entry['timeShow']
            detail_url = urljoin('https://www.ggzy.gov.cn', entry['url'])
            # Force https and swap the /html/a path for /html/b — only the
            # "b" variant of the page actually carries the article body.
            if detail_url.startswith("http://"):
                detail_url = detail_url.replace("http://", "https://")
            if 'https://www.ggzy.gov.cn/information/html/a' in detail_url:
                detail_url = detail_url.replace('https://www.ggzy.gov.cn/information/html/a', 'https://www.ggzy.gov.cn/information/html/b')
            try:
                detail = make_request(client, detail_url, detail_headers)
                soup = BeautifulSoup(detail.text, 'html.parser')
                # Keep the whole visible text of the detail page.
                results.append({
                    'title': title,
                    'timeShow': shown_time,
                    'url': detail_url,
                    'text': soup.get_text(separator='\n', strip=True),
                })
            except Exception as e:
                # Best-effort: log the failure and move on to the next item.
                print(f"Failed to retrieve {detail_url}: {e}")
            time.sleep(2)  # be polite between detail requests

# Persist everything as pretty-printed UTF-8 JSON.
with open('results.json', 'w', encoding='utf-8') as f:
    json.dump(results, f, ensure_ascii=False, indent=4)
print("数据已保存到 results.json")
成果展示
分析
请求库:httpx
数据提取:bs4
直接发请求,得到json数据,提取数据
得到标题,时间和链接
直接对提取到的链接发请求是拿不到正文数据的——虽然浏览器能打开页面、页面中也有数据,但代码请求得到的响应里没有正文。
经过观察发现:真正有数据的链接,其实就是把前面提取的链接路径中的 a 换成 b。
在代码中对链接做处理
最后详情页使用bs4提取全部的文本内容