爬取政务服务文章地址，解析文章页面，对全部文章中累计出现 6 次及以上的分词进行提取（第二段代码可换用 requests 和 BeautifulSoup 来请求和解析网页，运行时间会快一点）

最新推荐文章于 2024-09-17 23:15:58 发布

spider文

最新推荐文章于 2024-09-17 23:15:58 发布

阅读量342

点赞数 7

文章标签：政务 python 开发语言 后端

本文链接：https://blog.csdn.net/m0_73639297/article/details/138257818

版权

本文介绍了如何使用Selenium库配合ChromeOptions进行网页抓取，包括模拟登录、数据解析以及使用jieba进行中文分词。同时展示了如何处理反爬虫策略，最后将抓取的数据保存到CSV文件中。

摘要由CSDN通过智能技术生成

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
import gc
import csv
from matplotlib import rcParams  ## run command settings for plotting

# Matplotlib defaults: use the SimHei font and keep the minus sign renderable.
# Nothing in the visible script draws a plot; the settings are applied as in
# the original so module-level behaviour is unchanged.
config = {
    "mathtext.fontset": 'stix',
    "font.family": 'serif',
    "font.serif": ['SimHei'],
    "font.size": 10,  # font size; adjust to taste
    'axes.unicode_minus': False,  # keep the "-" sign displayable
}
rcParams.update(config)

# Number of result pages to scrape. The accompanying article states that two
# pages are crawled; the previous `range(1, 2)` only visited one.
PAGE_COUNT = 2

# CSS selectors for the site search form, the result list and the pager.
SEARCH_INPUT = 'body > header > div.tapcontent.wrap.clearfix > form > div > div.input-group > input'
SEARCH_BUTTON = 'body > header > div.tapcontent.wrap.clearfix > form > div > div.input-group > button'
RESULT_ITEMS = 'body > div.main.wrap.clearfix > div.content-left-wapper.l > div.list > div'
NEXT_PAGE = 'body > div.main.wrap.clearfix > div.content-left-wapper.l > div > div.pageNum > div > a.next'

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Suppress the "save password" prompt.
prefs = {'credentials_enable_service': False, 'profile.password_manager_enabled': False}
options.add_experimental_option('prefs', prefs)
# Hide the automation fingerprint that anti-bot checks look for.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)
try:
    # Give element lookups time to succeed while a page is still rendering.
    web.implicitly_wait(10)

    # Open the e-government portal and submit the search query.
    web.get('https://www.echinagov.com')
    web.maximize_window()
    web.find_element('css selector', SEARCH_INPUT).send_keys('政务服务问答系统开发的难点')
    web.find_element('css selector', SEARCH_BUTTON).click()

    with open('zhengwu.csv', 'w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(['链接', '标题'])
        for page in range(PAGE_COUNT):
            # Re-query on every page: elements from the previous page go stale
            # after navigation.
            infos = web.find_elements('css selector', RESULT_ITEMS)
            for info in infos:
                # Look the anchor up once and read both attributes from it.
                anchor = info.find_element('css selector', 'a')
                link = anchor.get_attribute('href')
                print(link)
                text = anchor.get_attribute('title')
                print(text)
                writer.writerow([link, text])
            # Only advance when another page will actually be scraped.
            if page < PAGE_COUNT - 1:
                web.find_element('css selector', NEXT_PAGE).click()
finally:
    # Always shut down the browser and its driver process, even on failure.
    web.quit()
print('文件已保存')

对网址进行解析

数据分词处理——此次处理的数据较少，代码中只爬取了两页数据，有兴趣的读者可以加大爬取的页数。

from selenium import webdriver
import csv
from collections import Counter
import jieba

from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
# Every token jieba produces, accumulated over all fetched articles.
all_contents = []

# Frequency table built from all_contents.
word_count = Counter()

# A token is kept only when its total count is strictly greater than this
# value, i.e. it occurs at least threshold + 1 times across all articles.
threshold = 5

# Input CSV with the columns "链接" (URL) and "标题" (title).
csv_file_path = 'zhengwu.csv'

# Reading stops at the first row with this title; that row and every row
# after it are not fetched.
STOP_TITLE = '贵阳：一“增”一“减”破解政务服务难点'

# CSS selector of the element whose text is taken as the article body.
ARTICLE_BODY = 'body > div.main.wrap.clearfix > div.content-left-wapper.l > article > div.article-body > div'

options = webdriver.ChromeOptions()
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Suppress the "save password" prompt.
prefs = {'credentials_enable_service': False, 'profile.password_manager_enabled': False}
options.add_experimental_option('prefs', prefs)
# Hide the automation fingerprint that anti-bot checks look for.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)
try:
    # Give element lookups time to succeed while a page is still rendering.
    web.implicitly_wait(10)

    # newline='' lets the csv module handle line endings inside quoted fields.
    with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csvfile:
        csv_reader = csv.DictReader(csvfile)

        # One row per article: its URL and its title.
        for row in csv_reader:
            url = row['链接']
            title = row['标题']
            print(title)
            if title == STOP_TITLE:
                break

            # Load the article and grab the visible text of its body.
            web.get(url)
            content = web.find_element('css selector', ARTICLE_BODY).text

            # Segment the Chinese text into tokens with jieba.
            words = jieba.lcut(content)
            all_contents.extend(words)
finally:
    # Always shut down the browser and its driver process, even on failure.
    web.quit()

# Count how often each token occurs across all articles.
word_count.update(all_contents)

# Keep the tokens whose count exceeds the threshold.
repeated_content = [word for word, count in word_count.items() if count > threshold]

# Save the frequently repeated tokens, one per row, to a new CSV file.
with open('repeated_content.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row.
    writer.writerow(['反复出现的内容'])

    for item in repeated_content:
        print(item)
        writer.writerow([item])