Baidu crawler: a Baidu keyword-search crawler using selenium + BeautifulSoup (code write-up)

Import modules

from selenium.webdriver.common.by import By
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
import time
import openpyxl
from openpyxl import load_workbook
import re
from bs4 import BeautifulSoup

Keyword list

# keyword list
kws=["人工智能透明","算法透明","推荐算法透明","推送透明","黑箱","算法黑箱","推荐算法黑箱","推送黑箱","算法公开","算法可解释"]
kw=kws[0]
# first, launch the browser

driver=webdriver.Chrome()

# open a Baidu search for the first keyword (wd is the keyword, URL-encoded)
driver.get('https://www.baidu.com/s?ie=UTF-8&wd=%E4%BA%BA%E5%B7%A5%E6%99%BA%E8%83%BD%E9%80%8F%E6%98%8E')
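# note: the wd value above is just kws[0] percent-encoded; a sketch (not from the
# original) that builds the same URL from the keyword instead of hard-coding it:
#   from urllib.parse import quote
#   driver.get("https://www.baidu.com/s?ie=UTF-8&wd=" + quote(kw))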

driver.find_element(By.XPATH,"/html/body/div[2]/div[2]/div/div/a[5]").click()  # click the 资讯 (News) tab

time.sleep(5)
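The absolute XPath above and the fixed sleeps are fragile; a minimal alternative sketch, assuming the News tab is rendered as an <a> element whose visible text is 资讯, is to use selenium's explicit waits:

from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# wait up to 10 s for the News tab to become clickable, then click it
WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.LINK_TEXT, "资讯"))
).click()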
# run
# (kws[0] was already searched via the URL above, so the loop starts at index 1)
for ikw in range(1,len(kws)):
    kw=kws[ikw]
    print(kw)

    wb = openpyxl.Workbook()        # create a new Excel file
    sheet = wb.active               # get the active worksheet
    sheet.title = kw
    wb.save(r"baidunews-{}.xlsx".format(kw))
    
    driver.switch_to.window(driver.window_handles[-1])  # switch back to the search tab
    search_button=driver.find_element(By.ID,"kw")
    search_button.clear()
    search_button.send_keys(kw)  # fill the search box

    driver.find_element(By.ID,"su").click()  # search
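    # Keys is imported above but never used; an equivalent way to submit (sketch):
    #   search_button.send_keys(Keys.RETURN)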
    
    time.sleep(5)
    
    datamining(wb,kw,driver)   # note: datamining() below must be defined before running this loop
# single-run test
datamining(wb,kw,driver)
# main scraping function (define this before running the cells above)
def datamining(wb,kw,driver):

    sheet = wb.active
    # header row: date, title, source, summary, URL
    sheet.cell(1,1).value="日期"
    sheet.cell(1,2).value="标题"
    sheet.cell(1,3).value="文章来源"
    sheet.cell(1,4).value="简述"
    sheet.cell(1,5).value="URL"
    current_row=sheet.max_row+1

    former_linkElems = None   # results of the previous page, for duplicate detection

    for x in range(100):
        # work on the current page
        driver.switch_to.window(driver.window_handles[-1])
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # collect the result containers on this page
        linkElems = soup.select('div.c-container')

        # if Baidu served the same page twice, stop
        if former_linkElems==linkElems:
            print("duplicate page, stopping")
            return 0
        
        Elems_title_list=[]
        this_page_case_list=[]

        # keep only unique, non-ad results ("广告" = ad, "大家还在搜" = related searches)
        for elem in linkElems:
            title=elem.get_text().strip()
            if re.search("广告",title) or re.search("大家还在搜",title):
                continue
            if title not in Elems_title_list:
                Elems_title_list.append(title)
                this_page_case_list.append(elem)

        # number of results on this page
        this_page_case_num=len(this_page_case_list)

        print("page "+str(x+1)+": "+str(this_page_case_num)+" results")

        for i in range(this_page_case_num):

            # title
            case_title=Elems_title_list[i]

            # date
            try:
                case_time = this_page_case_list[i].select('span.c-color-gray2')
                case_time=case_time[0].get_text().strip()
            except IndexError:
                case_time="NaN"

            # source
            try:
                case_source = this_page_case_list[i].select('span.c-color-gray')
                case_source=case_source[0].get_text().strip()
            except IndexError:
                case_source="NaN"

            # summary
            try:
                case_short = this_page_case_list[i].select('span.content-right_8Zs40')
                case_short=case_short[0].get_text().strip()
            except IndexError:
                case_short="NaN"

            # URL
            urls=this_page_case_list[i].find_all('a', href=True, target="_blank")
            case_url=urls[0]['href'] if urls else "NaN"



            # write the row (columns match the header: date, title, source, summary, URL)
            sheet.cell(current_row,1).value=case_time
            sheet.cell(current_row,2).value=case_title
            sheet.cell(current_row,3).value=case_source
            sheet.cell(current_row,4).value=case_short
            sheet.cell(current_row,5).value=case_url

            wb.save(r"baidunews-{}.xlsx".format(kw))
            print(kw+": saved row "+str(current_row))

            current_row=sheet.max_row+1

        # pagination: the last link in the pager is 下一页 ("next page") when one exists
        former_linkElems=linkElems
        button=driver.find_element(By.XPATH,"/html/body/div/div[3]/div[2]/div/a[last()]")
        if re.search("下一页",button.get_attribute('innerHTML')):
            button.click()
            time.sleep(6)
        else:
            print("no next-page button, stopping")
            return 0
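load_workbook is imported at the top but never used; if a run is interrupted, a hypothetical helper along these lines (assuming the per-keyword file already exists on disk) could reopen it and keep appending, instead of overwriting it with a fresh Workbook():

import os
from openpyxl import Workbook, load_workbook

def open_or_create(kw):
    # reopen an existing per-keyword file to resume, or create a fresh one
    path = "baidunews-{}.xlsx".format(kw)
    if os.path.exists(path):
        return load_workbook(path)   # keeps previously saved rows
    wb = Workbook()
    wb.active.title = kw
    return wb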