selenium:自动化抓取B站信息

代码实现效果

运行脚本后，会自动打开B站，按关键词搜索，并将抓取到的数据保存下来。

抓取下来的数据会自动保存为 xlsx 表格。

 具体代码

"""base.py"""
import time

from selenium.webdriver import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait


class Base:
    """Thin convenience wrapper around a Selenium WebDriver.

    Locators are ``(By.<strategy>, value)`` tuples, as accepted by
    ``expected_conditions.presence_of_element_located``.
    """

    def __init__(self, driver):
        """Store the WebDriver instance that every helper operates on."""
        self.driver = driver

    def base_find_element(self, loc, timeout=10, poll_frequency=0.5):
        """Wait until the element for ``loc`` is present in the DOM and return it.

        Args:
            loc: Locator tuple ``(By.<strategy>, value)``.
            timeout: Maximum number of seconds to wait.
            poll_frequency: Seconds between two presence checks.

        Raises:
            selenium.common.exceptions.TimeoutException: Raised by
                ``WebDriverWait.until`` when the element does not appear
                within ``timeout`` seconds.
        """
        wait = WebDriverWait(
            driver=self.driver,
            timeout=timeout,
            poll_frequency=poll_frequency,
        )
        return wait.until(EC.presence_of_element_located(loc))

    def base_click(self, loc):
        """Wait for the element located by ``loc`` and click it."""
        self.base_find_element(loc).click()

    def base_input(self, loc, value):
        """Clear the element located by ``loc`` and type ``value`` into it."""
        element = self.base_find_element(loc)
        element.clear()
        element.send_keys(value)

    def base_get_text(self, loc):
        """Return the visible text of the element located by ``loc``."""
        return self.base_find_element(loc).text

    def base_get_image(self, loc=None):
        """Save a screenshot of the current window as ``./<timestamp>.png``.

        Args:
            loc: Unused; kept (now optional) so existing callers keep working.

        Returns:
            The path the screenshot was written to, or ``None`` when the
            driver reports that the file could not be saved.
        """
        file_path = "./{}.png".format(time.strftime("%Y_%m_%d_%H_%M_%S"))
        if self.driver.get_screenshot_as_file(file_path):
            return file_path
        return None

    def base_move_mouse(self, loc):
        """Move the mouse pointer onto an element.

        Args:
            loc: Either a locator tuple/list (resolved through
                ``base_find_element``) or an already located element, which
                is passed to ``ActionChains.move_to_element`` unchanged.
        """
        # Every other helper takes a locator; accept one here as well while
        # still supporting callers that pass an element directly.
        if isinstance(loc, (tuple, list)):
            target = self.base_find_element(loc)
        else:
            target = loc
        ActionChains(self.driver).move_to_element(target).perform()

 

from selenium import webdriver
from selenium.webdriver.common.by import By
from openpyxl import Workbook
import os.path
import requests
from Base.base import Base
import os
import pandas as pd
import time


def get_search(url, keys):
    """Open ``url`` in Chrome, search for ``keys`` and return the driver.

    Args:
        url: Address of the page that hosts the search box.
        keys: Search keyword typed into the search box.

    Returns:
        The Chrome WebDriver, switched to the first window whose title
        contains ``keys`` (or to the last window when none matches).

    Raises:
        Any exception raised while driving the browser is re-raised after
        the browser has been closed.
    """
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        driver.maximize_window()
        time.sleep(1)

        base = Base(driver)
        # base_input waits for the element itself, so no separate lookup
        # of the search box is needed first.
        base.base_input((By.XPATH, '//input[@class="nav-search-input"]'), keys)
        base.base_click((By.XPATH, '//div[@class="nav-search-btn"]'))

        # Presumably the results open in a new window/tab: walk all handles
        # and stop on the one whose title mentions the keyword.
        for handle in driver.window_handles:
            driver.switch_to.window(handle)
            if keys in driver.title:
                break
        time.sleep(1)
    except BaseException:
        # Do not leave an orphaned browser process behind when setup fails
        # (including Ctrl+C); the caller never receives the driver to quit it.
        driver.quit()
        raise
    return driver


def sort_options(driver):
    """Ask the user for a sort order and click the matching tab.

    Option 1 is the page's default order, so nothing is clicked for it.
    Non-numeric or out-of-range input falls back to option 1 instead of
    raising ``ValueError`` or being silently ignored.

    Args:
        driver: WebDriver showing the search results page.
    """
    labels = {
        1: '综合排序',
        2: '最多播放',
        3: '最新发布',
        4: '最多弹幕',
        5: '最多收藏',
    }
    print('1.综合排序\n 2.最多播放\n 3.最新发布\n 4.最多弹幕\n 5.最多收藏')
    raw = input("请输入你的排序选项:")

    try:
        outcome = int(raw)
    except ValueError:
        outcome = None
    if outcome not in labels:
        print('输入无效,默认使用综合排序')
        outcome = 1

    if outcome != 1:
        # The n-th tab button corresponds to the n-th menu entry.
        tab_xpath = f'//button[{outcome}][@class="vui_button vui_button--tab mr_sm"]'
        driver.find_element(By.XPATH, tab_xpath).click()
    print(f'==================================={labels[outcome]}===============================================')


def _extract_video(card):
    """Read the fields of one search-result card into a dict keyed by column title."""
    return {
        '标题': card.find_element(By.CSS_SELECTOR, 'div h3').get_attribute('title'),
        '视频链接': card.find_element(By.CSS_SELECTOR, 'div a').get_attribute('href'),
        'up主': card.find_element(By.CSS_SELECTOR, 'div p>a>span:nth-of-type(1)').text,
        # The first character is dropped; presumably a separator in front of
        # the date — confirm against the live page.
        '发布时间': card.find_element(By.CSS_SELECTOR, 'div p>a>span:nth-of-type(2)').text[1:],
        '播放量': card.find_element(By.CSS_SELECTOR, 'div span:nth-of-type(1)>span').text,
        '评论量': card.find_element(By.CSS_SELECTOR, 'div div~span').text,
        '封面': card.find_element(By.CSS_SELECTOR, 'div picture>img').get_attribute('src'),
    }


def get_data(driver):
    """Scrape every result page and return the collected video records.

    Args:
        driver: WebDriver showing the first page of search results.

    Returns:
        A list of dicts, one per video card that could be parsed. Cards that
        fail to parse are reported on stdout and skipped.
    """
    # Imported here so the block does not depend on a file-level import.
    from selenium.common.exceptions import TimeoutException, WebDriverException

    all_data = []
    base = Base(driver)
    next_page_locator = (By.CSS_SELECTOR, '.vui_pagenation--btns button:nth-last-child(1)')
    pages_locator = (By.CSS_SELECTOR, '.vui_pagenation--btns button:nth-last-child(2)')

    try:
        # The second-to-last pagination button is read as the page count.
        total_pages = int(base.base_get_text(pages_locator))
    except (TimeoutException, ValueError):
        # No pagination bar (or no numeric label): treat it as a single page
        # rather than aborting before anything was scraped.
        total_pages = 1

    for num in range(1, total_pages + 1):
        print(f'=====================================正在保存第{num}页的数据内容=================================')

        # Hover over the "next" button before reading the cards; presumably
        # this scrolls the page so the cards finish rendering.
        try:
            page_next = driver.find_element(*next_page_locator)
            base.base_move_mouse(page_next)
        except WebDriverException:
            print('===================================最后一页了===============================================')

        time.sleep(1)
        for card in driver.find_elements(By.XPATH, '//div[@class="video-list row"]/div'):
            try:
                item = _extract_video(card)
            except Exception as e:
                # One malformed card must not stop the whole scrape.
                print(f"抓取数据时出现错误: {e}")
                continue
            print(item)
            all_data.append(item)

        if num == total_pages:
            # Nothing follows the last page, so do not click "next" again.
            break
        try:
            base.base_click(next_page_locator)
        except WebDriverException:
            break
        time.sleep(3)
        print('翻页成功')

    return all_data


def save_excel(all_data, file_dir='../bibi_file', file_prefix='哔哩'):
    """Write the scraped records to a timestamped ``.xlsx`` file.

    Args:
        all_data: List of record dicts; each dict becomes one row.
        file_dir: Target directory, created (with parents) when missing.
        file_prefix: Text placed in front of the timestamp in the file name.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(file_dir, exist_ok=True)

    file_name = "{}{}.xlsx".format(file_prefix, time.strftime("%Y_%m_%d_%H_%M_%S"))
    file_path = os.path.join(file_dir, file_name)

    pd.DataFrame(all_data).to_excel(file_path, index=False)
    print(f"文件已成功保存到: {file_path}")


def main():
    """Search Bilibili for a fixed keyword, scrape the results and save them.

    The browser is closed even when sorting, scraping or saving fails.
    """
    url = 'https://www.bilibili.com/'
    keys = '软件测试'
    driver = get_search(url, keys)
    try:
        driver.implicitly_wait(10)
        sort_options(driver)
        all_data = get_data(driver)
        save_excel(all_data)
    finally:
        # Always release the browser process, including on errors.
        driver.quit()


# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

 

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值