For practice only; if this infringes any rights, please contact me for removal.
Analysis
Inspecting the page shows that the data is rendered directly in the HTML.
Use requests to send a request to the 电能e招采 e-procurement platform (ebid.espic.com.cn),
then extract the fields with XPath.
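As a reference point, here is a minimal sketch of the request that produces the response object used below. The full cookies and headers appear in 文件1.py; this trimmed-down call is an assumption and may not pass the site's checks on its own.

import requests
from lxml import etree

# Hypothetical minimal request; real cookies/headers are shown in 文件1.py.
params = {'dates': '300', 'categoryId': '2', 'tenderMethod': '01', 'page': 1}
response = requests.get(
    'https://ebid.espic.com.cn/newgdtcms//category/iframe.html',
    params=params,
)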
# Parse the listing page and pull each announcement's fields
html = response.text
tree = etree.HTML(html)
announcements = []
for li in tree.xpath('//ul[@class="newslist"]/li'):
    title = li.xpath('.//h5/text()')[0].strip()
    link = li.xpath('.//a/@href')[0]
    bid_number = li.xpath('.//dd[@class="col"]/span/text()')[0]
    bid_method = li.xpath('.//dd[@class="col"]/span/text()')[1]
    deadline = li.xpath('.//dd[@class="col"]/span/text()')[2]
    publish_date = li.xpath('.//div[@class="newsDate"]/div/text()')[0]
This yields the title, link, tender number, tender method, registration deadline, and publish date.
Open one of the extracted links in the browser's address bar; the detail page embeds the announcement in an iframe, so use XPath to extract its src attribute.
Entering that src URL in the address bar in turn loads what looks like a PDF file.
Request that file directly, carrying the parameters below.
Inspection shows the id is the value nested inside the file parameter of the iframe src, so extract it:
# Extract the value of the file parameter
query_params = parse_qs(parsed_url.query)
file_url = query_params.get('file', [None])[0]
if file_url:
    # Extract the ID from the URL nested inside the file parameter
    parsed_file_url = urlparse(file_url)
    file_query_params = parse_qs(parsed_file_url.query)
    file_id = file_query_params.get('id', [None])[0]
    print(f"Extracted ID: {file_id}")
Save as PDF
# Check the response content type (default to '' so a missing header cannot crash)
content_type = response.headers.get('Content-Type', '')
# print(f"Content-Type: {content_type}")
if 'application/pdf' in content_type:
    # Save the PDF file locally
    sanitized_title = sanitize_filename(str(a))
    pdf_filename = os.path.join(pdf_directory, f'{sanitized_title}.pdf')
    # print(f"About to save PDF to: {pdf_filename}")
    with open(pdf_filename, 'wb') as file:
        file.write(response.content)
    # print(f"PDF saved as {pdf_filename}")
else:
    print("The downloaded file is not a PDF.")
Code
Save to a CSV file
文件1.py
Extracts the title, link, tender number, tender method, registration deadline, and publish date, and saves them to CSV.
import requests
from lxml import etree
import csv

cookies = {
    'JSESSIONID': '9A947E54626C762039BDC9FE3ED98DC1',
    '_uab_collina': '172378604942791863140555',
    'SESSION': 'OGIxNWFlZjEtMmFiZC00ODM5LTk3NWMtOTM3Yjk1OGZiYjU1',
    'Successfully': '',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://ebid.espic.com.cn/newgdtcms//category/demo2.html?dates=300&categoryId=2&tenderMethod=01&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Open the CSV file in append mode; the writer is created lazily
with open('announcements.csv', 'a', newline='', encoding='utf-8') as f:
    writer = None  # created once the field names are known
    for i in range(3):
        params = {
            'dates': '300',
            'categoryId': '2',
            'tenderMethod': '01',
            'tabName': '',
            'page': i,
            # the date may need updating
            'time': '2024-8-16',
        }
        response = requests.get('https://ebid.espic.com.cn/newgdtcms//category/iframe.html',
                                params=params, cookies=cookies, headers=headers)
        html = response.text
        tree = etree.HTML(html)
        announcements = []
        for li in tree.xpath('//ul[@class="newslist"]/li'):
            title = li.xpath('.//h5/text()')[0].strip()
            link = li.xpath('.//a/@href')[0]
            bid_number = li.xpath('.//dd[@class="col"]/span/text()')[0]
            bid_method = li.xpath('.//dd[@class="col"]/span/text()')[1]
            deadline = li.xpath('.//dd[@class="col"]/span/text()')[2]
            publish_date = li.xpath('.//div[@class="newsDate"]/div/text()')[0]
            announcement = {
                "title": title,
                "link": link,
                # tender number
                "bid_number": bid_number,
                # tender method
                "bid_method": bid_method,
                # registration deadline
                "deadline": deadline,
                # publish date
                "publish_date": publish_date,
            }
            announcements.append(announcement)
        # Initialize the writer and write the header on the first non-empty page
        if writer is None and announcements:
            writer = csv.DictWriter(f, fieldnames=announcements[0].keys())
            writer.writeheader()
        # Write the current page's rows
        if writer is not None:
            writer.writerows(announcements)
print("Data appended to announcements.csv")
Results
Opened as-is, the CSV clearly looks very messy.
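One likely cause, if the file is being opened in Excel, is the missing UTF-8 byte-order mark; this is a guess rather than something confirmed above. A minimal sketch that rewrites the file with the utf-8-sig encoding (the output name announcements_excel.csv is made up):

import csv

# Rewriting with utf-8-sig prepends a BOM so Excel detects UTF-8 correctly.
with open('announcements.csv', 'r', encoding='utf-8') as src:
    rows = list(csv.reader(src))
with open('announcements_excel.csv', 'w', newline='', encoding='utf-8-sig') as dst:
    csv.writer(dst).writerows(rows)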
Save to a JSON file
文件2.py
import requests
from lxml import etree
import json
import os

cookies = {
    'JSESSIONID': '9A947E54626C762039BDC9FE3ED98DC1',
    '_uab_collina': '172378604942791863140555',
    'SESSION': 'OGIxNWFlZjEtMmFiZC00ODM5LTk3NWMtOTM3Yjk1OGZiYjU1',
    'Successfully': '',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Connection': 'keep-alive',
    'Referer': 'https://ebid.espic.com.cn/newgdtcms//category/demo2.html?dates=300&categoryId=2&tenderMethod=01&tabName=%E6%8B%9B%E6%A0%87%E5%85%AC%E5%91%8A&page=1',
    'Sec-Fetch-Dest': 'iframe',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# If the JSON file already exists, load the existing data
if os.path.exists('announcements.json'):
    with open('announcements.json', 'r', encoding='utf-8') as f:
        all_announcements = json.load(f)
else:
    all_announcements = []

for i in range(3):
    params = {
        'dates': '300',
        'categoryId': '2',
        'tenderMethod': '01',
        'tabName': '',
        'page': i,
        # the date may need updating
        'time': '2024-8-16',
    }
    response = requests.get('https://ebid.espic.com.cn/newgdtcms//category/iframe.html',
                            params=params, cookies=cookies, headers=headers)
    html = response.text
    tree = etree.HTML(html)
    announcements = []
    for li in tree.xpath('//ul[@class="newslist"]/li'):
        title = li.xpath('.//h5/text()')[0].strip()
        link = li.xpath('.//a/@href')[0]
        bid_number = li.xpath('.//dd[@class="col"]/span/text()')[0]
        bid_method = li.xpath('.//dd[@class="col"]/span/text()')[1]
        deadline = li.xpath('.//dd[@class="col"]/span/text()')[2]
        publish_date = li.xpath('.//div[@class="newsDate"]/div/text()')[0]
        announcement = {
            "title": title,
            "link": link,
            "bid_number": bid_number,
            "bid_method": bid_method,
            "deadline": deadline,
            "publish_date": publish_date
        }
        print(announcement)
        announcements.append(announcement)
    # Add this page's announcements to the overall list
    all_announcements.extend(announcements)

# Save all announcements as a JSON file
with open('announcements.json', 'w', encoding='utf-8') as f:
    json.dump(all_announcements, f, ensure_ascii=False, indent=4)
print("Data appended to announcements.json")
Results
Download the detail-page PDFs
The script automatically creates a pdf folder and saves the downloaded PDFs into it.
The PDFs are named 1, 2, 3, 4, 5, and so on.
Naming them with the titles from the JSON file was tried first, but, possibly because the resulting file names were too long, not all of the PDFs came through.
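If title-based names are still wanted, one workaround is to truncate the sanitized title to a safe length and prefix the running index; a sketch, with MAX_NAME_LEN and safe_pdf_name being made-up names:

import os
import re

MAX_NAME_LEN = 80  # assumed safe length; filesystem limits are usually 255 bytes

def safe_pdf_name(title, index, directory='pdf'):
    # Replace illegal characters (same rule as sanitize_filename in 文件3.py),
    # then truncate and prefix the index so names stay unique.
    short = re.sub(r'[<>:"/\\|?*]', '_', title)[:MAX_NAME_LEN]
    return os.path.join(directory, f'{index}_{short}.pdf')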
文件3.py
import json
from urllib.parse import urlparse, parse_qs
from lxml import html
import requests
import os
import re

# Directory for the downloaded PDFs
pdf_directory = 'pdf'
# Create the directory if it does not exist
if not os.path.exists(pdf_directory):
    os.makedirs(pdf_directory)

def sanitize_filename(filename):
    # Replace characters that are illegal in file names
    return re.sub(r'[<>:"/\\|?*]', '_', filename)

cookies = {
    '_uab_collina': '172378604942791863140555',
    'SESSION': 'OGIxNWFlZjEtMmFiZC00ODM5LTk3NWMtOTM3Yjk1OGZiYjU1',
    'JSESSIONID': '9A947E54626C762039BDC9FE3ED98DC1',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Read the JSON file with an explicit encoding
with open('announcements.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
print(data)

for a, i in enumerate(data, start=1):
    print(a)
    print(i)
    link = i['link']
    print(link)
    title = i['title']  # kept for reference; files are named by the index instead
    response = requests.get(link, cookies=cookies, headers=headers)
    tree = html.fromstring(response.content)
    # Use XPath to extract the src attribute of the embedded iframe
    iframe_src = tree.xpath('//iframe/@src')[0]
    # print(iframe_src)
    # Parse the URL
    parsed_url = urlparse(iframe_src)
    # Extract the value of the file parameter
    query_params = parse_qs(parsed_url.query)
    file_url = query_params.get('file', [None])[0]
    if file_url:
        # Extract the ID from the URL nested inside the file parameter
        parsed_file_url = urlparse(file_url)
        file_query_params = parse_qs(parsed_file_url.query)
        file_id = file_query_params.get('id', [None])[0]
        print(f"Extracted ID: {file_id}")
        # Separate headers for the file download, so the detail-page headers
        # above are not overwritten on later iterations
        download_headers = {
            'Accept': '*/*',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Connection': 'keep-alive',
            'Referer': f'https://ebid.espic.com.cn//resource/css/pdfjs/web/viewer.html?file=https://ebid.espic.com.cn/bidprocurement/datacenter-cebpubserver/cebpubserver/dataCeboubServerCommonController/openFileById?fileType%3D2%26id%3D{file_id}&page=1',
            'Sec-Fetch-Dest': 'empty',
            'Sec-Fetch-Mode': 'cors',
            'Sec-Fetch-Site': 'same-origin',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36',
            'sec-ch-ua': '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
        }
        params = {
            'fileType': '2',
            'id': file_id,
        }
        response = requests.get(
            'https://ebid.espic.com.cn/bidprocurement/datacenter-cebpubserver/cebpubserver/dataCeboubServerCommonController/openFileById',
            params=params, cookies=cookies, headers=download_headers)
        # Check the response status code
        # print(f"Status Code: {response.status_code}")
        # Print the first 1000 bytes to inspect the returned content
        # print(response.text[:1000])
        # Check the response content type (default '' so a missing header cannot crash)
        content_type = response.headers.get('Content-Type', '')
        # print(f"Content-Type: {content_type}")
        if 'application/pdf' in content_type:
            # Save the PDF file locally, named by its running index
            sanitized_title = sanitize_filename(str(a))
            pdf_filename = os.path.join(pdf_directory, f'{sanitized_title}.pdf')
            # print(f"About to save PDF to: {pdf_filename}")
            with open(pdf_filename, 'wb') as pdf_file:
                pdf_file.write(response.content)
            # print(f"PDF saved as {pdf_filename}")
        else:
            print("The downloaded file is not a PDF.")
Results