Target site: the national standards public disclosure system 国家标准文件公开系统 (openstd.samr.gov.cn)
Summary of page types
1. Paginated scraping of the list endpoint
2. Scraping the pages behind each entry in the result list
3. Fetching the industry categories from an AJAX endpoint
4. The file download page requires an image captcha; a simulated browser completes the verification (a minimal OCR sketch follows this list)
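For step 4, the heavy lifting is done by ddddocr, which recognizes the captcha text from an image. A minimal standalone sketch of that call, assuming a captcha saved locally as captcha.png (the file name is only an example; the full code below crops the image from a browser screenshot instead):

import ddddocr

ocr = ddddocr.DdddOcr()
with open('captcha.png', 'rb') as f:  # hypothetical sample image
    captcha_bytes = f.read()
print(ocr.classification(captcha_bytes))  # prints the recognized text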
Preparation
Install the required libraries, and set up the folder for downloaded files and the database table (a setup sketch follows).
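A minimal setup sketch, assuming MySQL on localhost and the column set used by the INSERT statement in the full code; every column type below is an assumption, so adjust it to your own schema. The libraries themselves install with pip (requests, beautifulsoup4, ddddocr, pillow, selenium, pymysql):

import os
import pymysql

os.makedirs('D:\\spiderFiles\\', exist_ok=True)  # folder the browser downloads into

# 'your_db' / 'your_table' are placeholders, as in the full code below
con = pymysql.connect(host='127.0.0.1', port=3306, user='root', passwd='root',
                      db='your_db', charset='utf8')
with con.cursor() as cur:
    cur.execute("""
        CREATE TABLE IF NOT EXISTS your_table (
            id          BIGINT PRIMARY KEY,   -- snowflake id, not auto-increment
            title       VARCHAR(512),
            source      VARCHAR(64),
            source_href VARCHAR(512),
            public_time VARCHAR(32),
            expiry_time VARCHAR(32),
            create_time VARCHAR(32),
            fileno      VARCHAR(128),
            attachment  VARCHAR(256),
            industry    VARCHAR(128)
        )
    """)
con.commit()
con.close()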
When using Selenium, make sure the chromedriver version matches your installed Chrome version (a quick check is sketched below).
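One quick sanity check (with Selenium 4.6+ a matching driver is usually fetched automatically by Selenium Manager, but the version-mismatch error is common enough to verify by hand):

from selenium import webdriver

driver = webdriver.Chrome()
print(driver.capabilities['browserVersion'])                 # Chrome version
print(driver.capabilities['chrome']['chromedriverVersion'])  # chromedriver version
driver.quit()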
Complete code
import os
import requests
from bs4 import BeautifulSoup
import time
import ddddocr
from PIL import Image  # used to open and crop the captcha screenshot
from selenium import webdriver
from selenium.webdriver.common.by import By
import uuid
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql as mysql
import datetime
# Suppress the InsecureRequestWarning raised by the verify=False requests below
requests.packages.urllib3.disable_warnings()
# Session cookies copied from the browser; replace them with your own values
cookies = {
    'Hm_lvt_50758913e6f0dfc9deacbfebce3637e4': '1724468921',
    'Hm_lpvt_50758913e6f0dfc9deacbfebce3637e4': '1724804398',
    'JSESSIONID': '133D41B3608C62C40311889428F04115',
    'Hm_lvt_54db9897e5a65f7a7b00359d86015d8d': '1724467421',
    'HMACCOUNT': 'DCC1FF752F2805EB',
    'Hm_lpvt_54db9897e5a65f7a7b00359d86015d8d': '1724467636',
}
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    # 'Cookie': 'Hm_lvt_50758913e6f0dfc9deacbfebce3637e4=1724468921; Hm_lpvt_50758913e6f0dfc9deacbfebce3637e4=1724804398; JSESSIONID=133D41B3608C62C40311889428F04115; Hm_lvt_54db9897e5a65f7a7b00359d86015d8d=1724467421; HMACCOUNT=DCC1FF752F2805EB; Hm_lpvt_54db9897e5a65f7a7b00359d86015d8d=1724467636',
    'Referer': 'https://openstd.samr.gov.cn/bzgk/gb/std_list_type?r=0.2345047468490511&page=17&pageSize=10&p.p1=1&p.p6=11&p.p90=circulation_date&p.p91=desc',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}
params = {
    'r': '0.3488948163810215',
    'page': '16',
    'pageSize': '10',
    # 'p.p1': 1 = mandatory national standard, 2 = recommended national standard
    'p.p1': '1',
    # 'p.p6': industry (ICS) category code
    'p.p6': '11',
    'p.p90': 'circulation_date',
    'p.p91': 'desc',
}
data = {
    'pcode': '-1',
    'p.p1': '1',
    'p.p2': '',
    'p.p5': '',
    'p.p7': '',
    'p.p8': '',
}
# Local MySQL database ("your_db" / "your_table" are placeholders for real names)
con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="your_db", charset="utf8")

def inputdb(title, source_href, ddate, date2, fileno, id, attachment, industry, params_p1):
    global con
    cursor1 = con.cursor()
    public_time = ddate
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    source = '国家标准'
    if params_p1 == 1:
        source = "强制性" + source  # mandatory national standard
    if params_p1 == 2:
        source = "推荐性" + source  # recommended national standard
    # Deduplicate on identical titles (parameterized query, so quotes in titles are safe)
    sql = "select * from your_table where title = %s"
    cursor1.execute(sql, (title,))
    results = cursor1.fetchall()
    if len(results) > 0:
        print('The data already exists---')
        cursor1.close()
        return
    cursor1.close()
    cursor2 = con.cursor()
    cursor2.execute(
        "insert into your_table(id,title,source,source_href,public_time,expiry_time,create_time,fileno,attachment,industry)"
        " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
        (id, title, source, source_href, public_time, date2, create_time, fileno, attachment, industry))
    con.commit()
    cursor2.close()
# Snowflake algorithm for generating unique IDs
class Snowflake:
    def __init__(self, machine_id):
        self.machine_id = machine_id
        self.sequence = 0
        self.last_timestamp = -1

    def generate_id(self):
        timestamp = int(time.time() * 1000)
        if timestamp < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if timestamp == self.last_timestamp:
            # Up to 4096 IDs per millisecond; on overflow, wait for the next one
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                timestamp = self.wait_next_millis(self.last_timestamp)
        else:
            self.sequence = 0
        self.last_timestamp = timestamp
        # 41-bit timestamp (Twitter epoch offset) | machine id | sequence
        return ((timestamp - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        timestamp = int(time.time() * 1000)
        while timestamp <= last_timestamp:
            timestamp = int(time.time() * 1000)
        return timestamp
# Simulate a browser to pass the image captcha and complete the file download
class VerificationCode:
    def __init__(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        prefs = {'download.prompt_for_download': False}  # download without a save dialog
        chrome_options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=chrome_options)
        # Route downloads into a fixed folder via the DevTools protocol
        self.driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow',
                             'downloadPath': 'D:\\spiderFiles\\'}}
        self.driver.execute("send_command", params)
        # Page-load strategy (note: set after the driver is already created, so it
        # has no effect here; it would have to be passed in before webdriver.Chrome)
        desired_capabilities = DesiredCapabilities.CHROME
        desired_capabilities["pageLoadStrategy"] = "none"

    def get_pictures(self, href):
        self.driver.get(href)  # open the download page
        link = self.driver.find_element(By.XPATH, '//a[@href="javascript:download();void(0);"]')
        link.click()
        self.driver.maximize_window()
        time.sleep(5)
        self.driver.save_screenshot('pictures.png')  # full-window screenshot
        page_snap_obj = Image.open('pictures.png')
        img = self.driver.find_element(By.XPATH, '//img[@class="verifyCode"]')  # captcha element
        location = img.location
        size = img.size  # captcha width and height
        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']
        # Crop the captcha out of the screenshot by its position and size
        image_obj = page_snap_obj.crop((left, top, right, bottom))
        image_obj.show()  # display the cropped captcha
        return image_obj

    def image_str(self, href, name, f_name):
        try:
            image = self.get_pictures(href)
            ocr = ddddocr.DdddOcr()
            text = ocr.classification(image)
            print("OCR result:", text)
            # Type the captcha text and submit the form
            captcha_input = self.driver.find_element(By.XPATH, "//input[@id='verifyCode']")
            captcha_input.send_keys(text)
            time.sleep(3)
            download_button = self.driver.find_element(By.XPATH, '//button[@class="btn btn-primary"]')
            download_button.click()
            print('Captcha passed')
            time.sleep(60)  # leave time for the download to finish
            # Rename the downloaded file from its original name to the uuid name
            os.rename(os.path.join('D:\\spiderFiles\\', f_name),
                      os.path.join('D:\\spiderFiles\\', name))
            return text
        except FileNotFoundError:
            print('File not found')
        finally:
            self.driver.quit()  # always release the browser, even on the return path
def download_pdf(source_href, href_id, title, date, date2, fileno, industry, id, params_p1):
    name = str(uuid.uuid4().hex) + '.pdf'  # rename the file to a unique uuid
    attachment_url = '/spiderFiles/' + name  # download path stored in the database
    href = 'http://c.gb688.cn/bzgk/gb/showGb?type=download&hcno=' + href_id
    f_names = fileno.split(' ')
    f_name = str(f_names[0] + '+' + f_names[1]) + '.pdf'  # the file's original name on disk
    a = VerificationCode()
    a.image_str(href, name, f_name)
    print(title, '--- download finished ---', name)
    inputdb(title, source_href, date, date2, fileno, id, attachment_url, industry, params_p1)
    time.sleep(5)
if __name__ == '__main__':
    # Fetch the industry (ICS) category list from the AJAX endpoint
    res_list = requests.post('https://openstd.samr.gov.cn/bzgk/gb/ajaxIcsList',
                             cookies=cookies, headers=headers, data=data, verify=False).json()
    # The ICS category codes are not consecutive; the full list is:
    industry_lists = ['01', '03', '07', '11', '13', '17', '19', '21', '23', '25', '27', '29',
                      '31', '33', '35', '37', '39', '43', '45', '47', '49', '53', '55', '59', '61', '65',
                      '67', '71', '73', '75', '77', '79', '81', '83', '85', '87', '91', '93', '97']
    # Loop over both values of p.p1: 1 = mandatory standards, 2 = recommended standards
    for params_p1 in range(1, 3):
        # Loop over the ICS category codes
        for ics_code in industry_lists:  # renamed from "list" to avoid shadowing the builtin
            industry = ''
            # Find the category name matching this code
            for j in res_list:
                i_name = j['icsName']
                i_code = j['icsCode']
                if i_code == ics_code:
                    industry = i_name
                    break
            # Loop over page numbers; 120 is the maximum. Categories with fewer
            # pages break out of the loop once no result table comes back.
            for i in range(1, 121):
                params['p.p1'] = params_p1
                params['p.p6'] = ics_code
                params['page'] = i
                response = requests.get(
                    'https://openstd.samr.gov.cn/bzgk/gb/std_list_type',
                    params=params,
                    cookies=cookies,
                    headers=headers,
                    verify=False
                ).text
                # Snowflake generator for database IDs
                snowflake = Snowflake(1)
                # Parse the result-list page
                soup = BeautifulSoup(response, 'html.parser')
                table = soup.find('table', class_='table result_list table-striped table-hover')
                if table is None:  # past the last page for this category
                    break
                tr_list = table.find_all('tr')
                del tr_list[0]  # drop the header row
                for tr in tr_list:
                    # Abolished ("废止") standards cannot be downloaded; skip them
                    if '废止' in tr.find_all('td')[4].get_text():
                        continue
                    title = tr.find_all('td')[3].get_text().strip()
                    href_id = tr.find('a').get('onclick').split("'")[1]
                    a = VerificationCode()
                    source_href = 'https://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=' + href_id
                    a.driver.get(source_href)
                    # The file is only downloadable if the detail page shows a
                    # "下载标准" (download standard) link
                    has_download = '下载标准' in a.driver.page_source
                    a.driver.quit()  # release this check-browser either way
                    if not has_download:
                        continue
                    fileno = tr.find_all('td')[1].find('a').get_text()
                    date = tr.find_all('td')[5].get_text().strip()
                    date2 = tr.find_all('td')[6].get_text().strip()
                    j = tr.find('td').get_text()
                    # Database id is generated here rather than auto-incremented
                    id = snowflake.generate_id()
                    print(industry)
                    print('page', i, ', item', j)
                    download_pdf(source_href, href_id, title, date, date2, fileno, industry, id, params_p1)
                    print('Download finished')
    con.close()