selenium简单使用

最新推荐文章于 2024-01-16 20:03:40 发布

「已注销」

最新推荐文章于 2024-01-16 20:03:40 发布

阅读量91

点赞数

分类专栏： python

本文链接：https://blog.csdn.net/qq_51962013/article/details/112983053

版权

python 专栏收录该内容

4 篇文章 0 订阅

订阅专栏

selenium简单使用

# -*- coding: utf-8 -*-
# @Time : 2021/1/22/0022 11:38
# @Author : WXH
# @File : myselenium.py
# @Software : PyCharm
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import xlwt

# Browser configuration -------------------------------------------------------
chrome_options = webdriver.ChromeOptions()
for _flag in (
    # Works around browser crashes (a driver/Chrome version mismatch is
    # another possible cause of such crashes).
    '--no-sandbox',
    # Google's documentation mentions this flag as a way to avoid a bug.
    '--disable-gpu',
):
    chrome_options.add_argument(_flag)

browser = webdriver.Chrome(options=chrome_options)
# Shared explicit wait: every lookup below gives up after 10 seconds.
WAIT = WebDriverWait(browser, 10)

# Output workbook -------------------------------------------------------------
book = xlwt.Workbook(encoding='utf-8', style_compression=0)
sheet = book.add_sheet('蔡徐坤篮球', cell_overwrite_ok=True)

# Row 0 holds the column titles, written left to right.
for _col, _title in enumerate(('名称', '地址', '观看', '弹幕', '上传时间', 'up主')):
    sheet.write(0, _col, _title)

# Index of the next sheet row to fill (row 0 is the header).
n = 1
# Page count shown by the pagination bar; assigned by search().
total = 0


def search():
    """Open bilibili, submit the search query and scrape the first result page.

    Types the query into the home-page search box, clicks the search button,
    switches the driver to another window handle, stores the text of the
    "last page" pagination button in the module-level ``total`` and then calls
    ``get_source()`` for the first page.

    Returns:
        None. If any wait times out, a message is printed and the function
        returns early; ``total`` keeps whatever value it had at that point.
    """
    global total

    try:
        print('开始访问b站....')
        browser.get("https://www.bilibili.com")
        # Local names avoid shadowing the builtin ``input``.
        search_box = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#nav_searchform > input")))
        search_button = WAIT.until(EC.element_to_be_clickable((By.XPATH, '//*[@id="nav_searchform"]/div/button')))
        search_box.send_keys('蔡徐坤 篮球')
        search_button.click()

        # Switch to the window holding the search results.
        print('跳转到新窗口')
        # Taking the last handle instead of index 1 avoids an IndexError when
        # the results open in the current tab (only one handle exists).
        # NOTE(review): assumes the newest window is listed last — confirm.
        browser.switch_to.window(browser.window_handles[-1])

        total = WAIT.until(EC.presence_of_element_located((By.CSS_SELECTOR, "#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.last > button"))).text
        print('已爬取1页...共' + str(total) + '页...')
        get_source()
    except TimeoutException:
        # Report the failure instead of returning silently, so an empty
        # spreadsheet can be traced back to its cause.
        print('等待页面元素超时，搜索未完成')


def get_source():
    """Parse the currently loaded page and hand it to ``save_to_excel``.

    Waits until the ``#bili-search`` element is present, then parses the
    browser's page source with the ``lxml`` parser.
    """
    results_root = (By.CSS_SELECTOR, '#bili-search')
    WAIT.until(EC.presence_of_element_located(results_root))
    save_to_excel(BeautifulSoup(browser.page_source, 'lxml'))


def save_to_excel(soup):
    """Append one sheet row per video entry found in ``soup``.

    Looks up the ``video-list clearfix`` container, then writes six cells for
    every ``video-item matrix`` entry inside it into the module-level ``sheet``
    at row ``n``, incrementing ``n`` after each entry.

    Args:
        soup: Parsed result page; only its ``find`` method is used here.

    Returns:
        None. If the container is missing, a message is printed and nothing is
        written.
    """
    global n

    video_list = soup.find(class_='video-list clearfix')
    if video_list is None:
        # Without this guard the chained ``find_all`` raises AttributeError.
        print('未找到视频列表，跳过本页')
        return

    for item in video_list.find_all(class_='video-item matrix'):
        anchor = item.find('a')
        title = anchor.get('title')

        print('爬取：' + title)

        # Cell order follows the header row: name, link, views, danmaku,
        # upload time, uploader.
        row = (
            title,
            anchor.get('href'),
            item.find(class_='so-icon watch-num').text,
            item.find(class_='so-icon hide').text,
            item.find(class_='so-icon time').text,
            # NOTE(review): a bare 'so-icon' lookup returns the first element
            # carrying that class, which may be the view counter rather than
            # the uploader — confirm against the page markup.
            item.find(class_='so-icon').text,
        )
        for col, value in enumerate(row):
            sheet.write(n, col, value)

        n += 1

def next_page(page_num, max_retries=3):
    """Click the "next" pagination button and scrape the page it loads.

    After the click, waits until the active pagination button contains
    ``str(page_num)``, then calls ``get_source()``. On a timeout the browser
    is refreshed and the whole sequence is tried again.

    Args:
        page_num: Page number expected to be active after the click.
        max_retries: How many times to retry after a timeout. The original
            code retried through unbounded recursion, which ends in a
            RecursionError when the page never loads.

    Raises:
        TimeoutException: If the page still times out after ``max_retries``
            retries.
    """
    attempts = max(max_retries, 0) + 1
    for attempt in range(1, attempts + 1):
        try:
            print('获取下一页数据')
            next_btn = WAIT.until(EC.element_to_be_clickable((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.next > button')))
            next_btn.click()
            WAIT.until(EC.text_to_be_present_in_element((By.CSS_SELECTOR, '#all-list > div.flow-loader > div.page-wrap > div > ul > li.page-item.active > button'), str(page_num)))
            get_source()
            print('已爬取' + str(page_num) + '页...共' + str(total) + '页...')
            return
        except TimeoutException:
            if attempt == attempts:
                # Out of retries: surface the failure to the caller.
                raise
            browser.refresh()


def main():
    """Scrape the first result page, then every following page up to ``total``.

    ``search()` `handles page 1 and sets the module-level ``total``;
    ``next_page`` is then called for pages 2 through ``int(total)``. The
    browser is shut down afterwards whether or not scraping succeeded.
    """
    try:
        search()
        for page_num in range(2, int(total) + 1):
            next_page(page_num)
    finally:
        # quit() ends the whole driver session; close() would only close the
        # current window and leave the other window and the driver running.
        browser.quit()


if __name__ == '__main__':
    try:
        main()
    finally:
        # Save in ``finally`` so rows collected before a failure are kept.
        # xlwt writes the legacy .xls format, so the file gets an .xls
        # extension; saved as .xlsx, the extension would not match the content.
        book.save(u'蔡徐坤篮球.xls')

「已注销」

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
selenium简单使用

selenium简单使用# -*- coding = utf-8 -*-# @Time : 2021/1/22/0022 11:38# @Author : WXH# @File : myselenium.py# @Software : PyCharmfrom selenium import webdriverfrom selenium.common.exceptions import TimeoutExceptionfrom selenium.webdriver.common.by impo
复制链接

扫一扫

专栏目录