使用selenium获取中华人民共和国宪法前100页法律法规文件链接-CSDN博客

本文链接：https://blog.csdn.net/m0_73639297/article/details/138167397

# 导入Selenium的webdriver模块
from selenium import webdriver
# 导入webdriver的Chrome选项配置模块
from selenium.webdriver.chrome.options import Options
# 导入time模块用于控制等待时间
import time
# 导入csv模块用于写入CSV文件
import csv
# 导入matplotlib的rcParams模块用于设置绘图参数
from matplotlib import rcParams

# 设置matplotlib绘图参数，包括字体、字号等
config = {
    "mathtext.fontset": 'stix',
    "font.family": 'serif',
    "font.serif": ['SimHei'],
    "font.size": 10,
    'axes.unicode_minus': False
}
rcParams.update(config)

# 创建Chrome浏览器选项对象，用于配置浏览器行为
options = webdriver.ChromeOptions()
# 添加实验性选项，以禁用自动化检测
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# 屏蔽保存密码提示框
prefs = {'credentials_enable_service': False, 'profile.password_manager_enabled': False}
options.add_experimental_option('prefs', prefs)
# 添加选项，以禁用Blink特性，避免被识别为自动化脚本
options.add_argument('--disable-blink-features=AutomationControlled')

# 启动Chrome浏览器实例
web = webdriver.Chrome(options=options)
# 打开指定的网页
web.get('https://flk.npc.gov.cn')
# 最大化浏览器窗口
web.maximize_window()

# 在搜索框中输入关键词，这里输入的是“中华人民共和国宪法”
web.find_element('css selector', '#flfgTitle').send_keys('中华人民共和国宪法')
# 点击搜索按钮
web.find_element('css selector', 'body > div.banner > div.find1 > ul > li:nth-child(2) > ul > li:nth-child(1) > i').click()

# 打开一个CSV文件用于存储抓取的数据，如果文件不存在则创建，写入模式为写入
with open('bigdata.csv', 'w', newline='', encoding='utf-8') as file:
    # 创建一个csv写入器
    writer = csv.writer(file)
    # 写入表头，包括链接、信息、制定机关、法律性质、时效性和公布日期
    writer.writerow(['链接', '信息', '制定机关', '法律性质', '时效性', '公布日期'])
    
    # 设置一个循环，用于遍历抓取数据的页面，这里设置为最多100页
    for i in range(1, 100):
        # 查找页面中的所有表格行元素
        infos = web.find_elements('css selector', '#flData > tr')
        for info in infos:
            # 获取每个表格行中的第一个单元格中的链接属性
            link1 = info.find_element('css selector', '.l-sx .l-wen').get_attribute('onclick')
            # 构造完整的链接地址
            link = "https://flk.npc.gov.cn" + link1.replace("showDetail('.", "").replace("')", "")
            # 打印链接地址
            print(link, end='  ')
            # 获取每个表格行中的第一个单元格中的文本内容
            text = info.find_element('css selector', '.l-sx .l-wen ').text
            # 打印文本内容
            print(text, end='  ')
            # 获取每个表格行中的第二个单元格中的文本内容
            author = info.find_element('css selector', '.l-sx2 .l-wen1 ').text
            # 打印制定机关
            print(author, end='  ')
            # 获取每个表格行中的第三个单元格中的文本内容
            sx3 = info.find_elements('css selector', '.l-sx3 .l-wen1 ')
            root = sx3[0].text
            # 打印法律性质
            print(root)

            writer.writerow([link,text,author,root,timeuse,date])
        web.find_element('css selector','#layui-laypage-'+str(i)+' > a.layui-laypage-next').click()
        web.implicitly_wait(10)
        time.sleep(1)
print('数据已保存')
web.quit()