前言
爬取 B 站 UP 主投稿视频的八项数据(播放量、弹幕、评论、收藏、投币、转发、点赞、合作成员),并附带标题、发布时间和时长;代码中限制了只爬取 2023 年的视频。
代码
import json
from time import sleep
import requests
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
import pandas as pd
def get_info(BV, timeout=10):
    """Fetch engagement statistics for one video from the Bilibili view API.

    Args:
        BV: Video id sent as the ``bvid`` query parameter.
        timeout: Seconds to wait for the HTTP response; without it a stalled
            connection would block the crawl forever.

    Returns:
        A list ``[view, danmaku, reply, favorite, coin, share, like, pep]``.
        The seven counters are taken from the ``data.stat`` object of the
        response and are ``None`` when the response carries no such data
        (e.g. an error payload). ``pep`` is a string that starts with one
        space followed by ``"<title>:<name> "`` for every entry of
        ``data.staff``; it is just ``" "`` when there is no staff list.

    Raises:
        requests.RequestException: On network failure or timeout.
        json.JSONDecodeError: If the response body is not valid JSON.
    """
    base_url = 'https://api.bilibili.com/x/web-interface/view'
    header = {
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    r = requests.get(base_url, params={'bvid': BV}, headers=header, timeout=timeout)
    _json_data = json.loads(r.text)

    # An error response has no usable "data" object; fall back to empty
    # mappings so one unavailable video yields None counters instead of
    # raising AttributeError and aborting the whole run.
    video = _json_data.get('data') or {}
    stat = video.get('stat') or {}

    # Collaborators ("staff") are only present on co-authored videos.
    members = video.get('staff')
    pep = " "
    if members is not None:
        pep += "".join(f"{member['title']}:{member['name']} " for member in members)

    stat_keys = ('view', 'danmaku', 'reply', 'favorite', 'coin', 'share', 'like')
    return [stat.get(key) for key in stat_keys] + [pep]
# ---- Crawl configuration ----
# Uploader's video list ordered by publish date; the crawl starts at page 9 (pn=9).
START_URL = r'https://space.bilibili.com/488055582/video?tid=0&pn=9&keyword=&order=pubdate'
TARGET_YEAR = "2023"  # items whose date text contains this are collected
STOP_YEAR = "2022"    # the first item whose date text contains this ends the crawl
PAGE_LOAD_WAIT = 2    # seconds given to the page to render its video list

driver = webdriver.Edge()
data = []
try:
    driver.get(START_URL)
    # NOTE(review): the list appears to be rendered after the initial load, so
    # wait once before the first read, as is done after every page turn.
    sleep(PAGE_LOAD_WAIT)

    f = True
    while f:
        # Explicit CSS selector for elements carrying both classes.
        elements = driver.find_elements(By.CSS_SELECTOR, ".small-item.fakeDanmu-item")
        for element in elements:
            pub_time = element.find_element(By.CLASS_NAME, "time").text
            if STOP_YEAR in pub_time:
                # Stopping at the first STOP_YEAR item presumes the list is
                # ordered newest-first.
                f = False
                break
            if TARGET_YEAR in pub_time:
                title = element.find_element(By.CLASS_NAME, "title").text
                length = element.find_element(By.CLASS_NAME, "length").text
                info = get_info(element.get_attribute("data-aid"))
                info.extend([title, pub_time, length])
                data.append(info)

        if not f:
            # Stop condition reached: do not turn the page again.
            break

        try:
            next_button = driver.find_element(By.CLASS_NAME, "be-pager-next")
        except NoSuchElementException:
            # No "next" control on this page: nothing further to crawl.
            break
        # NOTE(review): on the last page the button presumably carries a
        # "be-pager-disabled" class; stopping here avoids re-reading the same
        # page forever. Confirm against the live markup.
        if "be-pager-disabled" in (next_button.get_attribute("class") or ""):
            break
        next_button.click()
        sleep(PAGE_LOAD_WAIT)

    sleep(5)  # pause kept from the original before the browser is shut down
finally:
    # quit() ends the WebDriver session and its driver process even when the
    # crawl raised; close() would only close the current window.
    driver.quit()

df = pd.DataFrame(data=data, columns=["播放量","弹幕","评论","收藏","投币","转发","点赞","合作","标题","发布时间","时长"])
df.to_excel("result.xlsx")