# 导入模块
from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import openpyxl
from openpyxl import load_workbook
import re
from bs4 import BeautifulSoup
# Keyword list (L11 was a bare text line — a SyntaxError — now a comment)
kws = ["人工智能透明", "算法透明", "推荐算法透明", "推送透明", "黑箱",
       "算法黑箱", "推荐算法黑箱", "推送黑箱", "算法公开", "算法可解释"]
kw = kws[0]
# Launch the browser and open a pre-built Baidu search for the first
# keyword (the URL-encoded query is "人工智能透明", i.e. kws[0]).
driver = webdriver.Chrome()
driver.get('https://www.baidu.com/s?ie=UTF-8&wd=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E9%80%8F%E6%98%8E')
# Switch to the "资讯" (news) tab; the XPath is position-based and will
# break if Baidu changes its result-page layout.
driver.find_element(By.XPATH, "/html/body/div[2]/div[2]/div/div/a[5]").click()
time.sleep(5)
# Run: scrape every remaining keyword (kws[0] is already loaded above).
# NOTE(review): this loop calls datamining(), which is defined further
# down in the file — move this loop below that definition (or guard it
# under `if __name__ == "__main__":`) or it raises NameError at runtime.
for kw in kws[1:]:
    print(kw)
    wb = openpyxl.Workbook()          # a fresh workbook per keyword
    sheet = wb.active
    sheet.title = kw
    wb.save(r"baidunews-{}.xlsx".format(kw))
    # Return to the search window, type the keyword and search.
    driver.switch_to.window(driver.window_handles[-1])
    search_box = driver.find_element(By.ID, "kw")
    search_box.clear()
    search_box.send_keys(kw)
    driver.find_element(By.ID, "su").click()
    time.sleep(5)
    datamining(wb, kw, driver)
# The stray duplicate "single test" call of datamining() that followed the
# loop was removed: it re-scraped the last keyword a second time.
# Main scraping routine
def datamining(wb, kw, driver):
    """Scrape Baidu News result pages for *kw* into workbook *wb*.

    Walks up to 100 result pages in the driver's current window. For each
    non-ad result it extracts date, title, source, summary and URL,
    appends a row to the active sheet, and saves the workbook after every
    row so a crash loses at most one entry.

    Returns 0 when pagination ends, a page repeats, or a page has no
    results; returns None after 100 pages.
    """
    sheet = wb.active
    # Header row: col1=日期(date), col2=标题(title), col3=来源(source),
    # col4=简述(summary), col5=URL.
    sheet.cell(1, 1).value = "日期"
    sheet.cell(1, 2).value = "标题"
    sheet.cell(1, 3).value = "文章来源"
    sheet.cell(1, 4).value = "简述"
    sheet.cell(1, 5).value = "URL"
    current_row = sheet.max_row + 1
    former_linkElems = None   # previous page's results, for repeat detection
    for x in range(100):
        # Work on the most recently opened window.
        driver.switch_to.window(driver.window_handles[-1])
        # Explicit parser avoids bs4's "no parser specified" warning and
        # keeps parsing deterministic across machines.
        soup = BeautifulSoup(driver.page_source, "html.parser")
        # Result containers on this page.
        linkElems = soup.select('div.c-container')
        if not linkElems:
            # Guard: the original indexed linkElems[0] unconditionally and
            # would crash with IndexError on an empty page.
            return 0
        if former_linkElems == linkElems:
            print("两页重复")
            return 0
        # De-duplicate by title and drop ads / "related searches" blocks.
        # The first container is kept unconditionally, as in the original.
        Elems_title_list = [linkElems[0].get_text().strip()]
        this_page_case_list = [linkElems[0]]
        for i in range(1, len(linkElems)):
            title = linkElems[i].get_text().strip()
            # Skip advertisement blocks and the "大家还在搜" suggestion box.
            if re.search("广告", str(title)) or re.search("大家还在搜", str(title)):
                continue
            if title not in Elems_title_list:
                Elems_title_list.append(title)
                this_page_case_list.append(linkElems[i])
        this_page_case_num = len(this_page_case_list)
        print("第"+str(x+1)+"页,本页case数目为 "+str(this_page_case_num))
        for i in range(this_page_case_num):
            case_title = Elems_title_list[i]
            # Date; "NaN" when the element is missing. except clauses are
            # narrowed from the original bare `except:`.
            try:
                case_time = this_page_case_list[i].select('span.c-color-gray2')[0].get_text().strip()
            except (IndexError, AttributeError):
                case_time = "NaN"
            # Source outlet.
            try:
                case_sourse = this_page_case_list[i].select('span.c-color-gray')[0].get_text().strip()
            except (IndexError, AttributeError):
                case_sourse = "NaN"
            # Short summary text.
            try:
                case_short = this_page_case_list[i].select('span.content-right_8Zs40')[0].get_text().strip()
            except (IndexError, AttributeError):
                case_short = "NaN"
            # Result URL: first outbound link in the container. The
            # original indexed [0] unguarded and crashed on link-less cards.
            try:
                case_url = this_page_case_list[i].find_all('a', href=True, target="_blank")[0]['href']
            except IndexError:
                case_url = "NaN"
            # BUGFIX: the original wrote title into col1 and date into col2,
            # contradicting the header row — order fixed to match headers.
            sheet.cell(current_row, 1).value = case_time
            sheet.cell(current_row, 2).value = case_title
            sheet.cell(current_row, 3).value = case_sourse
            sheet.cell(current_row, 4).value = case_short
            sheet.cell(current_row, 5).value = case_url
            # Save after every row. Path made consistent with the workbook
            # the caller creates (was a hard-coded absolute E:\ path, so
            # data ended up in a different file than the one created).
            wb.save(r"baidunews-{}.xlsx".format(kw))
            print(kw+"项搜索词 已保存 "+str(current_row)+" 项")
            current_row = sheet.max_row + 1
        former_linkElems = linkElems
        # Pagination: the last link in the pager should read "下一页".
        try:
            button = driver.find_element(By.XPATH, "/html/body/div/div[3]/div[2]/div/a[last()]")
        except Exception:
            # One retry with the same locator, in case the pager rendered late.
            button = driver.find_element(By.XPATH, "/html/body/div/div[3]/div[2]/div/a[last()]")
        if re.search("下一页", button.get_attribute('innerHTML')):
            button.click()
            time.sleep(6)
        else:
            print("没有下一页按钮,爬虫中断")
            return 0