代码:
import os
import time
import json
import re
import requests
import threading
import math
from tqdm import tqdm
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
def get_cookie(website: str):
    """Open *website* in Chrome, wait for a manual login, then dump the
    session cookies to data/jsoncookie.json.

    Workflow: the browser window opens on *website*; log in by hand,
    then press Enter in the terminal (input() blocks until you do).
    Afterwards a file named jsoncookie.json holds the cookies.
    """
    browser = webdriver.Chrome()
    browser.get(website)
    browser.implicitly_wait(10)
    # Block until the user confirms the manual login is finished.
    input("Enter any key after you login")
    cookie_list = browser.get_cookies()
    print('dictcookie:', cookie_list)
    serialized = json.dumps(cookie_list)
    print('jsoncookie:', serialized)
    with open('data/jsoncookie.json', 'w') as f:
        f.write(serialized)
    browser.close()
def login_bilibili():
    """Open bilibili.com in Chrome and restore a saved login session by
    replaying the cookies stored in data/jsoncookie.json."""
    browser = webdriver.Chrome()
    browser.get('https://www.bilibili.com/')
    # Discard every cookie the fresh session started with.
    browser.delete_all_cookies()
    with open('data/jsoncookie.json', 'r') as f:
        saved_cookies = json.loads(f.read())
    # Inject the previously captured cookies into this browser session.
    for ck in saved_cookies:
        browser.add_cookie({
            'domain': '.bilibili.com',
            'name': ck['name'],
            'value': ck['value'],
            'path': '/',
            'expires': None,
            'httponly': False,
        })
    # Reload so the site picks up the authenticated session.
    browser.get('https://www.bilibili.com/')
class GetInfo():
    """Crawl a bilibili user's video space: collect every video id,
    download each video's audio/video DASH streams, and scrape its stat
    counters (plays, danmu, likes, ...) into res.json.

    Layout under *sav_dir*: url.json (video ids), res.json (stats),
    audios/<aid>.mp3, videos/<aid>.mp4.
    """

    def __init__(self, user_id, sav_dir: str):
        self.a_list = []  # data-aid of every video discovered while paging
        self.d = webdriver.Chrome()  # shared Chrome driver for paging
        self.user_id = user_id
        self.base_url = f'https://space.bilibili.com/{user_id}/video'
        cookie_file = 'data/jsoncookie.json'
        if not os.path.exists(cookie_file):
            # First run: interactive login captures cookies to disk.
            get_cookie(self.base_url)
        # Log in by replaying the saved cookies into the driver.
        self.d.get(self.base_url)
        self.d.delete_all_cookies()
        with open(cookie_file, 'r') as f:
            cookies = json.loads(f.read())
        for cookie in cookies:
            self.d.add_cookie({
                'domain': '.bilibili.com',
                'name': cookie['name'],
                'value': cookie['value'],
                'path': '/',
                'expires': None,
                'httponly': False,
            })
        self.d.get(self.base_url)
        time.sleep(4)  # let the authenticated page settle
        self.data_list = []  # per-video info dicts, filled by worker threads
        self._data_lock = threading.Lock()  # guards self.data_list across workers
        self.sav_dir = sav_dir
        os.makedirs(self.sav_dir, exist_ok=True)
        self.sav_audio_dir = os.path.join(sav_dir, 'audios')
        self.sav_video_dir = os.path.join(sav_dir, 'videos')
        os.makedirs(self.sav_audio_dir, exist_ok=True)
        os.makedirs(self.sav_video_dir, exist_ok=True)
        self.url_path = os.path.join(self.sav_dir, "url.json")
        self.info_path = os.path.join(self.sav_dir, 'res.json')

    def get_url(self):
        """Append the data-aid of every video on the current page to
        self.a_list and persist the accumulated list to url.json."""
        ul = WebDriverWait(self.d, 10).until(
            lambda x: x.find_element(By.XPATH, '//*[@id="submit-video-list"]/ul[1]'))
        lis = ul.find_elements(By.XPATH, "li")
        for li in lis:
            self.a_list.append(li.get_attribute("data-aid"))
        with open(self.url_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps any non-ASCII ids/text readable on disk
            f.write(json.dumps(self.a_list, ensure_ascii=False))

    def next_page(self):
        """Click through every remaining result page, harvesting video
        ids from each; returns the accumulated id list."""
        total = WebDriverWait(self.d, 10).until(
            lambda x: x.find_element(By.XPATH, '//*[@id="submit-video-list"]/ul[3]/span[1]'))
        number = re.findall(r"\d+", total.text)
        total = int(number[0])
        for page in range(1, total + 1):
            try:
                self.d.find_element(By.LINK_TEXT, '下一页').click()
                time.sleep(3)  # wait for the next page to render
                self.get_url()
            except Exception as e:
                # The final page has no "next" link; log and keep going.
                print(f"Failed to click next page {page}: {e}")
        return self.a_list

    def get_video(self, urls, start, end):
        """Download streams and scrape stat counters for urls[start:end].

        Runs as a thread worker: appends one info dict per successfully
        scraped video to self.data_list (lock-protected). Videos whose
        audio and video files already exist are skipped.
        """

        def download_file(session, file_url, file_path, chunk_size=1024):
            # stream=True so large media files never sit fully in memory;
            # raise_for_status() so a failed download doesn't write garbage.
            with session.get(file_url, stream=True) as resp:
                resp.raise_for_status()
                with open(file_path, mode='wb') as f:
                    for chunk in resp.iter_content(chunk_size=chunk_size):
                        if chunk:  # skip keep-alive chunks
                            f.write(chunk)

        # One Session per worker reuses the TCP connection across requests.
        with requests.Session() as session:
            session.headers.update({
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36",
                "referer": "https://www.bilibili.com"
            })
            base_url = "http://www.bilibili.com/video/"
            # Compile once; each pattern runs against every page in the loop.
            title_pattern = re.compile(r'<title data-vue-meta="true">([^&]+)</title>')
            stat_patterns = {
                "play_count": re.compile(r'视频播放量 (\d+)'),
                "danmu_count": re.compile(r'弹幕量 (\d+)'),
                "like_count": re.compile(r'点赞数 (\d+)'),
                "coin_count": re.compile(r'投硬币枚数 (\d+)'),
                "favorite_count": re.compile(r'收藏人数 (\d+)'),
                "share_count": re.compile(r'转发人数 (\d+)'),
            }
            for url in tqdm(urls[int(start):int(end)]):
                full_url = base_url + url
                audio_file_path = os.path.join(self.sav_audio_dir, url + '.mp3')
                video_file_path = os.path.join(self.sav_video_dir, url + '.mp4')
                if os.path.exists(audio_file_path) and os.path.exists(video_file_path):
                    continue  # already downloaded
                try:
                    response = session.get(full_url)
                    if response.status_code != 200:
                        print(f"Failed {full_url}: HTTP {response.status_code}")
                        continue
                    page = response.text
                    # The player config is embedded in the page as inline JSON.
                    playinfo = re.findall(r'<script>window.__playinfo__=(.*?)</script>', page)[0]
                    playinfo_data = json.loads(playinfo)
                    audio_url = playinfo_data['data']['dash']['audio'][0]['base_url']
                    video_url = playinfo_data['data']['dash']['video'][0]['base_url']
                    if not os.path.exists(audio_file_path):
                        download_file(session, audio_url, audio_file_path)
                    if not os.path.exists(video_file_path):
                        download_file(session, video_url, video_file_path)
                    time.sleep(1)  # be polite between downloads
                    title_match = title_pattern.search(page)
                    video_info = {
                        "url": full_url,
                        "title": title_match.group(1) if title_match else "未找到匹配的内容",
                    }
                    # Each stat falls back to '0' when the page omits it;
                    # search once per pattern (the original searched twice).
                    for key, pattern in stat_patterns.items():
                        m = pattern.search(page)
                        video_info[key] = m.group(1) if m else '0'
                    with self._data_lock:
                        self.data_list.append(video_info)
                except Exception as e:
                    print(f"Failed to get video info for url {full_url}: {e}")

    def run(self, num_threads=8):
        """Full pipeline: collect video ids (unless url.json is cached),
        fan the downloads out over *num_threads* workers, then dump all
        scraped stats to res.json and release the browser."""
        if not os.path.exists(self.url_path):
            self.get_url()    # ids from the first page
            self.next_page()  # ids from every remaining page
        with open(self.url_path, "r", encoding="utf-8") as f:
            data = json.load(f)
        threads = []
        part = int(math.ceil(len(data) / num_threads))
        for i in range(num_threads):
            start = i * part
            # The last worker takes any remainder slice.
            end = (i + 1) * part if i != num_threads - 1 else len(data)
            thread = threading.Thread(target=self.get_video, args=(data, start, end))
            threads.append(thread)
            thread.start()
        for thread in threads:
            thread.join()  # wait for every worker to finish
        # utf-8 is required: ensure_ascii=False emits raw CJK characters.
        with open(self.info_path, 'w', encoding='utf-8') as f:
            json.dump(self.data_list, f, ensure_ascii=False, indent=4)
        self.d.quit()  # release the Chrome driver once the crawl is done
def get_blogger_info():
    """Return the hard-coded list of bloggers to crawl (id + display name)."""
    bloggers = [{"id": "1111111", "name": "时代的"}]
    return bloggers
def get_info():
    """Crawl every configured blogger's space into the dataset directory."""
    res_dir = '/Users/me/Datasets/crawl_bilibili'
    for blogger in get_blogger_info():
        crawler = GetInfo(blogger["id"], res_dir)
        crawler.run()
if __name__ == "__main__":
    # login_bilibili()  # uncomment for a one-off manual login session
    get_info()
代码为参考以下内容整合而成: