python爬取国家标准文件

爬取网站:国家标准文件公开系统

页面类型 总结

1.翻页爬取接口

2.数据列表对页面爬取

3.爬取接口获得行业类型

4.文件下载页面需要图片验证,模拟浏览器完成验证

准备工作

下载需要的库,下载文件的文件夹和数据库表的建立

使用selenium要注意webdriver版本和谷歌浏览器版本的对应

完整代码

import os
import requests
from bs4 import BeautifulSoup
import time
import ddddocr
from PIL import Image  # 用于打开图片和对图片处理
from selenium import webdriver
from selenium.webdriver.common.by import By
import uuid
from selenium.webdriver.common.desired_capabilities import DesiredCapabilities
import pymysql as mysql
import datetime

# Suppress urllib3 InsecureRequestWarning — all requests below use verify=False
requests.packages.urllib3.disable_warnings()
# Session cookies copied from a real browser session.
# NOTE(review): JSESSIONID and the Hm_* analytics cookies presumably expire;
# refresh them from the browser when requests start failing — TODO confirm.
cookies = {
    'Hm_lvt_50758913e6f0dfc9deacbfebce3637e4': '1724468921',
    'Hm_lpvt_50758913e6f0dfc9deacbfebce3637e4': '1724804398',
    'JSESSIONID': '133D41B3608C62C40311889428F04115',
    'Hm_lvt_54db9897e5a65f7a7b00359d86015d8d': '1724467421',
    'HMACCOUNT': 'DCC1FF752F2805EB',
    'Hm_lpvt_54db9897e5a65f7a7b00359d86015d8d': '1724467636',
}

# Browser-like request headers copied from DevTools so the server treats us
# as an ordinary Edge/Chrome visitor.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8,en-GB;q=0.7,en-US;q=0.6',
    'Connection': 'keep-alive',
    # 'Cookie': 'Hm_lvt_50758913e6f0dfc9deacbfebce3637e4=1724468921; Hm_lpvt_50758913e6f0dfc9deacbfebce3637e4=1724804398; JSESSIONID=133D41B3608C62C40311889428F04115; Hm_lvt_54db9897e5a65f7a7b00359d86015d8d=1724467421; HMACCOUNT=DCC1FF752F2805EB; Hm_lpvt_54db9897e5a65f7a7b00359d86015d8d=1724467636',
    'Referer': 'https://openstd.samr.gov.cn/bzgk/gb/std_list_type?r=0.2345047468490511&page=17&pageSize=10&p.p1=1&p.p6=11&p.p90=circulation_date&p.p91=desc',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36 Edg/127.0.0.0',
    'sec-ch-ua': '"Not)A;Brand";v="99", "Microsoft Edge";v="127", "Chromium";v="127"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Query-string parameters for the paginated standard-list endpoint.
# The main loop overwrites 'p.p1', 'p.p6' and 'page' on every request.
params = {
    'r': '0.3488948163810215',
    'page': '16',
    'pageSize': '10',
    # 'p.p1': 1 = mandatory national standard, 2 = recommended national standard
    'p.p1': '1',
    # 'p.p6': industry (ICS) category code
    'p.p6': '11',
    'p.p90': 'circulation_date',
    'p.p91': 'desc',
}
# Form body for the ajaxIcsList endpoint that returns the industry categories.
data = {
    'pcode': '-1',
    'p.p1': '1',
    'p.p2': '',
    'p.p5': '',
    'p.p7': '',
    'p.p8': '',
}
# Local MySQL connection. "库名" ("database name") is a placeholder —
# substitute the real database name/credentials before running.
con = mysql.connect(host="127.0.0.1", port=3306, user="root", passwd="root", db="库名", charset="utf8")


def inputdb(title, source_href, ddate, date2, fileno, id, attachment, industry, params_p1):
    """Insert one standard record into the database, skipping duplicates.

    Records whose ``title`` already exists are skipped (simple de-dup).
    ``params_p1`` selects the source label: 1 = mandatory ("强制性"),
    2 = recommended ("推荐性") national standard.

    Args:
        title: standard title (de-dup key).
        source_href: detail-page URL of the standard.
        ddate: publication date (stored as ``public_time``).
        date2: implementation/expiry date (stored as ``expiry_time``).
        fileno: the standard's document number, e.g. "GB 1234-2020".
        id: pre-generated snowflake primary key (table id is not auto-increment).
        attachment: local attachment path stored for the downloaded PDF.
        industry: human-readable industry (ICS) category name.
        params_p1: 1 or 2, see above.
    """
    global con
    public_time = ddate
    create_time = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    source = '国家标准'
    if params_p1 == 1:
        source = "强制性" + source
    if params_p1 == 2:
        source = "推荐性" + source
    # "表名" ("table name") is a placeholder for the real table name.
    cursor = con.cursor()
    try:
        # Parameterized queries (was %-string formatting): prevents SQL
        # injection and no longer breaks on titles containing quotes.
        cursor.execute("select * from 表名 where title = %s", (title,))
        if cursor.fetchall():
            print('The data already exists---')
            return
        cursor.execute(
            "insert into 表名(id,title,source,source_href,public_time,expiry_time,create_time,fileno,attachment,industry)"
            " values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)",
            (id, title, source, source_href, public_time, date2, create_time, fileno, attachment, industry))
        con.commit()
    finally:
        # Close the cursor on every path (the original leaked it on insert errors).
        cursor.close()


# 雪花算法获取uid
# Snowflake-style unique ID generator (used for non-auto-increment DB keys)
class Snowflake:
    """Generate 64-bit time-ordered unique ids.

    Layout: (milliseconds since the Twitter epoch 1288834974657) << 22
            | machine_id << 12 | 12-bit per-millisecond sequence.
    """

    def __init__(self, machine_id):
        self.machine_id = machine_id
        self.sequence = 0          # per-millisecond counter, wraps at 4096
        self.last_timestamp = -1   # last millisecond an id was issued in

    def generate_id(self):
        """Return the next unique id; raises if the clock moved backwards."""
        now_ms = int(time.time() * 1000)
        if now_ms < self.last_timestamp:
            raise Exception("Clock moved backwards")
        if now_ms != self.last_timestamp:
            # Fresh millisecond: restart the sequence counter.
            self.sequence = 0
        else:
            # Same millisecond: bump the sequence; on wrap-around, spin
            # until the next millisecond tick.
            self.sequence = (self.sequence + 1) & 4095
            if self.sequence == 0:
                now_ms = self.wait_next_millis(self.last_timestamp)
        self.last_timestamp = now_ms
        return ((now_ms - 1288834974657) << 22) | (self.machine_id << 12) | self.sequence

    def wait_next_millis(self, last_timestamp):
        """Busy-wait until the wall clock advances past *last_timestamp*."""
        now_ms = int(time.time() * 1000)
        while now_ms <= last_timestamp:
            now_ms = int(time.time() * 1000)
        return now_ms


# 模拟浏览器操作 进行图片验证 完成文件下载
class VerificationCode:
    def __init__(self):
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_experimental_option('excludeSwitches', ['enable-automation'])
        prefs = {'download.prompt_for_download': False}  # 不弹出浏览器 下载文件保留对话框
        chrome_options.add_experimental_option("prefs", prefs)
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.command_executor._commands["send_command"] = ("POST", '/session/$sessionId/chromium/send_command')
        params = {'cmd': 'Page.setDownloadBehavior',
                  'params': {'behavior': 'allow',
                             'downloadPath': 'D:\\spiderFiles\\'}}
        self.driver.execute("send_command", params)
        # 等待超时
        desired_capabilities = DesiredCapabilities.CHROME
        desired_capabilities["pageLoadStrategy"] = "none"

    def get_pictures(self, href):
        self.driver.get(href)  # 打开登陆页面
        link = self.driver.find_element(By.XPATH, '//a[@href="javascript:download();void(0);"]')
        link.click()
        self.driver.maximize_window()
        time.sleep(5)
        self.driver.save_screenshot('pictures.png')  # 全屏截图
        page_snap_obj = Image.open('pictures.png')
        img = self.driver.find_element(By.XPATH, '//img[@class="verifyCode"]')  # 验证码元素位置
        location = img.location
        size = img.size  # 获取验证码的大小参数
        left = location['x']
        top = location['y']
        right = left + size['width']
        bottom = top + size['height']
        image_obj = page_snap_obj.crop((left, top, right, bottom))  # 按照验证码的长宽,切割验证码
        image_obj.show()  # 打开切割后的完整验证码
        return image_obj

    def image_str(self, href, name, f_name):
        try:
            image = self.get_pictures(href)
            orc = ddddocr.DdddOcr()
            text = orc.classification(image)
            print("识别结果:", text)
            # 输入验证码并提交表单
            captcha_input = self.driver.find_element(By.XPATH, "//input[@id='verifyCode']")
            captcha_input.send_keys(text)
            time.sleep(3)
            download_button = self.driver.find_element(By.XPATH, '//button[@class="btn btn-primary"]')
            download_button.click()
            print('验证成功')
            time.sleep(60)  # 给出下载文件的时间
            os.rename(os.path.join('D:\\spiderFiles\\', f_name),
                      os.path.join('D:\\spiderFiles\\', name))
            return text
        except FileNotFoundError:
            print('File not found')
        self.driver.close()
        self.driver.quit()


def download_pdf(source_href, href_id, title, date, date2, fileno, industry, id, params_p1):
    """Download one standard's PDF via the captcha flow and record it in the DB.

    Args:
        source_href: detail-page URL (stored in the DB).
        href_id: the site's ``hcno`` identifier used to build the download URL.
        title, date, date2, fileno, industry: row fields scraped from the list page.
        id: pre-generated snowflake primary key.
        params_p1: 1 = mandatory, 2 = recommended national standard.
    """
    name = uuid.uuid4().hex + '.pdf'  # unique local filename
    attachment_url = '/spiderFiles/' + name  # attachment path stored in the DB
    href = 'http://c.gb688.cn/bzgk/gb/showGb?type=download&hcno=' + href_id
    # The site saves the file as "<prefix>+<number>.pdf" built from the standard
    # number, e.g. "GB 1234-2020" -> "GB+1234-2020.pdf".
    # BUG FIX: guard against a fileno without a space, which previously raised
    # IndexError on f_names[1].
    f_names = fileno.split(' ')
    if len(f_names) >= 2:
        f_name = f_names[0] + '+' + f_names[1] + '.pdf'
    else:
        f_name = f_names[0] + '.pdf'
    a = VerificationCode()
    a.image_str(href, name, f_name)
    print(title, '---下载完成---', name)
    inputdb(title, source_href, date, date2, fileno, id, attachment_url, industry, params_p1)
    time.sleep(5)  # throttle between downloads


if __name__ == '__main__':
    # Fetch the industry (ICS) category list once up front.
    res_list = requests.post('https://openstd.samr.gov.cn/bzgk/gb/ajaxIcsList',
                             cookies=cookies, headers=headers, data=data, verify=False).json()
    # The top-level ICS codes are not consecutive; enumerate the valid ones.
    industry_lists = ['01', '03', '07', '11', '13', '17', '19', '21', '23', '25', '27', '29',
                      '31', '33', '35', '37', '39', '43', '45', '47', '49', '53', '55', '59', '61', '65',
                      '67', '71', '73', '75', '77', '79', '81', '83', '85', '87', '91', '93', '97']
    # p.p1 = 1: mandatory standards, p.p1 = 2: recommended standards.
    for params_p1 in range(1, 3):
        # (renamed loop variable: the original shadowed the builtin `list`)
        for ics_code in industry_lists:
            # Resolve the human-readable industry name for this ICS code.
            industry = ''
            for entry in res_list:
                if entry['icsCode'] == ics_code:
                    industry = entry['icsName']
                    break
            # 120 is the site's maximum page count; categories with fewer pages
            # exit early when the result table disappears.
            for page_no in range(1, 120):
                params['p.p1'] = params_p1
                params['p.p6'] = ics_code
                params['page'] = page_no
                response = requests.get(
                    'https://openstd.samr.gov.cn/bzgk/gb/std_list_type',
                    params=params,
                    cookies=cookies,
                    headers=headers,
                    verify=False
                ).text
                # Snowflake generator for the non-auto-increment DB ids.
                snowflake = Snowflake(1)
                # Parse the result-list page.
                soup = BeautifulSoup(response, 'html.parser')
                table = soup.find('table', class_='table result_list table-striped table-hover')
                # BUG FIX: past the last page there is no result table; the
                # original crashed with AttributeError instead of moving on.
                if table is None:
                    break
                tr_list = table.find_all('tr')
                del tr_list[0]  # drop the header row
                for tr in tr_list:
                    cells = tr.find_all('td')
                    # Repealed ("废止") standards cannot be downloaded — skip.
                    if '废止' in cells[4].get_text():
                        continue
                    title = cells[3].get_text().strip()
                    href_id = tr.find('a').get('onclick').split("'")[1]
                    source_href = 'https://openstd.samr.gov.cn/bzgk/gb/newGbInfo?hcno=' + href_id
                    # Only pages showing a '下载标准' (download) link have a PDF.
                    checker = VerificationCode()
                    checker.driver.get(source_href)
                    downloadable = '下载标准' in checker.driver.page_source
                    # BUG FIX: quit the probe browser — the original leaked one
                    # Chrome process per row.
                    checker.driver.quit()
                    if not downloadable:
                        continue
                    fileno = cells[1].find('a').get_text()
                    date = cells[5].get_text().strip()
                    date2 = cells[6].get_text().strip()
                    row_no = tr.find('td').get_text()
                    # DB primary key (table id is not auto-increment);
                    # renamed from `id`, which shadowed the builtin.
                    record_id = snowflake.generate_id()
                    print(industry)
                    print('第', page_no, '页-第', row_no, '条')
                    download_pdf(source_href, href_id, title, date, date2, fileno, industry, record_id, params_p1)
                    print('下载完成')
    con.close()
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值