from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
import gc
import csv
from matplotlib import rcParams ## run command settings for plotting
# Matplotlib run-command defaults: CJK-capable serif font and correct
# minus-sign rendering.
# NOTE(review): nothing in this script actually plots — these settings only
# matter if plotting code is added later.
config = {
    'mathtext.fontset': 'stix',
    'font.family': 'serif',
    'font.serif': ['SimHei'],
    'font.size': 10,              # base font size; adjust to taste
    'axes.unicode_minus': False,  # render '-' correctly with CJK fonts
}
rcParams.update(config)  # apply the plotting defaults
# Configure Chrome so the site is less likely to flag the session as a bot.
options = webdriver.ChromeOptions()
# Drop the "Chrome is being controlled by automated software" infobar.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Suppress the save-password prompt.
prefs = {'credentials_enable_service': False, 'profile.password_manager_enabled': False}
options.add_experimental_option('prefs', prefs)
# Hide the AutomationControlled blink feature used for bot detection.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)
# Open the e-government portal (the original comment wrongly said "Taobao")
# and submit the search query for the Q&A-system topic.
web.get('https://www.echinagov.com')
web.maximize_window()
time.sleep(2)  # crude wait so the header search form has rendered before we query it
web.find_element('css selector','body > header > div.tapcontent.wrap.clearfix > form > div > div.input-group > input').send_keys('政务服务问答系统开发的难点')
web.find_element('css selector','body > header > div.tapcontent.wrap.clearfix > form > div > div.input-group > button').click()
# Scrape the search-result list pages and save (link, title) rows to CSV.
with open('zhengwu.csv', 'w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerow(['链接', '标题'])  # header: link, title
    pages = 2  # the accompanying notes say two pages are crawled;
               # the original range(1, 2) only ever scraped one.
    for page in range(pages):
        time.sleep(2)  # let the result list render before locating elements
        infos = web.find_elements('css selector', 'body > div.main.wrap.clearfix > div.content-left-wapper.l > div.list > div')
        for info in infos:
            # Locate the anchor once instead of twice per row.
            anchor = info.find_element('css selector', 'a')
            link = anchor.get_attribute('href')
            print(link)
            text = anchor.get_attribute('title')
            print(text)
            writer.writerow([link, text])
        if page < pages - 1:
            # Advance to the next result page; skip the click after the
            # final page since nothing more will be read.
            web.find_element('css selector', 'body > div.main.wrap.clearfix > div.content-left-wapper.l > div > div.pageNum > div > a.next').click()
print('文件已保存')
# --- Part 2: parse each saved article URL ---
# Tokenize the scraped text. The dataset handled here is small; if interested,
# increase the number of pages crawled above (the code only crawls two pages).
from selenium import webdriver
import csv
from collections import Counter
import jieba
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver import ActionChains
# --- Shared state for the word-frequency pass ---
# Input CSV produced by the crawler above.
csv_file_path = 'zhengwu.csv'
# A word is kept only when it occurs strictly more than this many times.
threshold = 5
# Tokens from every article, in crawl order.
all_contents = []
# Frequency table built from those tokens.
word_count = Counter()
# Launch a second Chrome session with the same anti-detection tweaks
# as the crawler above.
options = webdriver.ChromeOptions()
# Drop the "controlled by automated software" infobar.
options.add_experimental_option('excludeSwitches', ['enable-automation'])
# Suppress the save-password prompt.
prefs = {
    'credentials_enable_service': False,
    'profile.password_manager_enabled': False,
}
options.add_experimental_option('prefs', prefs)
# Mask the AutomationControlled blink fingerprint.
options.add_argument('--disable-blink-features=AutomationControlled')
web = webdriver.Chrome(options=options)
# Read the saved (link, title) rows, visit each article, and tokenize its body.
with open(csv_file_path, mode='r', encoding='utf-8') as csvfile:
    csv_reader = csv.DictReader(csvfile)
    for row in csv_reader:
        url = row['链接']    # article URL column
        title = row['标题']  # article title column
        print(title)
        # Hard-coded sentinel: stop once this known article is reached.
        # NOTE(review): brittle — the loop silently processes everything
        # if this exact title never appears; confirm it is still wanted.
        if title == '贵阳:一“增”一“减”破解政务服务难点':
            break
        # Fetch the article page (the redundant `else` after `break` is gone).
        web.get(url)
        time.sleep(1)  # give the article body a moment to render
        content = web.find_element('css selector','body > div.main.wrap.clearfix > div.content-left-wapper.l > article > div.article-body > div').text
        # Segment the Chinese text into words with jieba.
        words = jieba.lcut(content)
        all_contents.extend(words)
# Tally how often each token appeared across all articles.
word_count.update(all_contents)
# Keep only the words occurring strictly more than `threshold` times
# (the original had this comment duplicated on two consecutive lines).
repeated_content = [word for word, count in word_count.items() if count > threshold]
# Persist the frequent words to a new CSV, one per row.
with open('repeated_content.csv', mode='w', encoding='utf-8', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['反复出现的内容'])  # header row
    for item in repeated_content:
        print(item)
        writer.writerow([item])