selenium简单使用

selenium简单使用

# -*- coding: utf-8 -*-
# @Time : 2021/1/22/0022 11:38
# @Author : WXH
# @File : myselenium.py
# @Software : PyCharm
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt

# Chrome launch flags:
#   --no-sandbox   works around browser crashes (a crash may also mean the
#                  driver and Chrome versions do not match)
#   --disable-gpu  recommended by Google's documentation to avoid a bug
chrome_options = webdriver.ChromeOptions()
for launch_flag in ('--no-sandbox', '--disable-gpu'):
    chrome_options.add_argument(launch_flag)

browser = webdriver.Chrome(options=chrome_options)
# Shared explicit wait: every WAIT.until(...) polls for up to 10 seconds.
WAIT = WebDriverWait(browser, 10)

# Workbook that collects the scraped rows; cells on the single sheet may be
# overwritten.
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)

# Header row (row 0), one heading per column.
for column, heading in enumerate(('名称', '地址', '观看', '弹幕', '上传时间', 'up主')):
    sheet.write(0, column, heading)

# n: index of the next sheet row to fill (row 0 holds the headings).
n = 1
# total: number of result pages; overwritten by search() with the text of the
# "last page" pagination button.
total = 0


def search():
    """Open bilibili, submit the search and scrape the first result page.

    Side effects: drives the module-level ``browser``, stores the text of the
    "last page" pagination button in the global ``total`` and calls
    ``get_source()`` for the first page.

    If any wait times out, a message is printed and the function returns
    without raising; ``total`` keeps whatever value it had at that point.
    """
    global total
    try:
        print('开始访问b站....')
        browser.get("https://www.bilibili.com")
        search_box = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#nav_searchform > input")))
        submit = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="nav_searchform"]/div/button')))
        search_box.send_keys('蔡徐坤 篮球')

        # Remember the open windows so the one created by the click can be
        # identified afterwards.
        handles_before = browser.window_handles
        submit.click()

        # Switch to the new window. Wait for it to exist first: reading
        # window_handles right after the click can still return only the
        # original window, which made a fixed index of 1 raise IndexError.
        print('跳转到新窗口')
        WAIT.until(EC.new_window_is_opened(handles_before))
        opened = [handle for handle in browser.window_handles if handle not in handles_before]
        browser.switch_to.window(opened[-1])

        total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button"))).text
        print('已爬取1页...共' + str(total) + '页...')
        get_source()
        return
    except TimeoutException:
        # Report the timeout instead of returning silently, so an empty
        # result sheet can be traced back to its cause.
        print('搜索超时,未能加载结果页')
        return


def get_source():
    """Parse the currently loaded result page and hand it to save_to_excel().

    Waits until the ``#bili-search`` element is present, then parses the
    browser's page source with the lxml parser.
    """
    results_root = (By.CSS_SELECTOR, '#bili-search')
    WAIT.until(EC.presence_of_element_located(results_root))
    save_to_excel(BeautifulSoup(browser.page_source, 'lxml'))


def _cell_text(tag):
    """Return the text of *tag*, or '' when the lookup found nothing."""
    return tag.text if tag is not None else ''


def save_to_excel(soup):
    """Append one sheet row per video found in *soup*.

    Columns follow the header row: title, link, views, danmaku count,
    upload time, uploader. Rows are written starting at the global row
    counter ``n``, which is advanced by one per video.

    Args:
        soup: parsed result page (a BeautifulSoup document).

    Missing elements produce empty cells instead of raising, and a page
    without a result list writes nothing.
    """
    global n

    container = soup.find(class_='video-list clearfix')
    if container is None:
        return

    for item in container.find_all(class_='video-item matrix'):
        anchor = item.find('a')
        title = (anchor.get('title') if anchor is not None else None) or ''
        link = (anchor.get('href') if anchor is not None else None) or ''
        views = _cell_text(item.find(class_='so-icon watch-num'))
        danmaku = _cell_text(item.find(class_='so-icon hide'))
        upload_time = _cell_text(item.find(class_='so-icon time'))
        # class_='so-icon' matches ANY tag carrying that class, including the
        # three counters above, so it can return one of them instead of the
        # uploader. Require the class list to be exactly ['so-icon'].
        # NOTE(review): assumes the uploader tag has no other class - confirm
        # against the live page markup.
        uploader = _cell_text(item.find(lambda tag: tag.get('class') == ['so-icon']))

        print('爬取:' + title)

        row = (title, link, views, danmaku, upload_time, uploader)
        for column, value in enumerate(row):
            sheet.write(n, column, value)

        n = n + 1

def next_page(page_num, max_retries=3):
    """Click "next", wait for page *page_num* to become active and scrape it.

    Args:
        page_num: number expected on the active pagination button after the
            click.
        max_retries: how many times to refresh the browser and try again
            after a timeout before giving up.

    Raises:
        TimeoutException: when the page still has not loaded after
            ``max_retries`` refresh-and-retry rounds. The earlier version
            retried by unbounded recursion and never gave up.
    """
    for attempt in range(max_retries + 1):
        try:
            print('获取下一页数据')
            next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
            next_btn.click()
            # The active pagination button shows the new page number once the
            # next page has been rendered.
            WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button'), str(page_num)))
            get_source()
            print('已爬取' + str(page_num) + '页...共' + str(total) + '页...')
            return
        except TimeoutException:
            if attempt == max_retries:
                raise
            browser.refresh()


def main():
    """Run the scrape: first page via search(), then pages 2..total.

    ``total`` is the module-level page count that search() fills in; when
    search() leaves it at 0 the loop body never runs.

    The browser session is always shut down, even when scraping fails.
    """
    try:
        search()
        for i in range(2, int(total) + 1):
            next_page(i)

    finally:
        # quit() ends the whole session (every window plus the driver
        # process). close() would only close the current window, leaving the
        # original tab and chromedriver running.
        browser.quit()


if __name__ == '__main__':
    try:
        main()
    finally:
        # Save in the finally clause so rows scraped before a failure are
        # not lost. xlwt writes the legacy .xls format, so the extension must
        # be .xls; the same bytes in a .xlsx file are rejected by Excel.
        book.save(u'蔡徐坤篮球.xls')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值