本文从【批量抓取页面的信息】、【批量下载抓取到的链接】、【对比更新本地数据库】三个方面分别来说。
一、【批量抓取页面的信息】
想要实现的功能就是从某站的个人收藏夹或者个人主页抓取里面的所有视频的信息(aid,title,url),然后存入本地的Excel表格。主要涉及到Python-selenium(浏览器自动化工具)和Python-pandas(数据处理与Excel读写工具)的使用。
# 爬取收藏夹所有作品信息,只读脚本(第一步)
import pandas as pd
from selenium.webdriver.edge.options import Options
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import time
# Collect the IDs of the (up to) 20 videos on the current favourites page.
def get_one_page(wd, video_lists):
    """Append one dict per video (aid, title, url) on the current
    favourites page to video_lists and return the list."""
    # Anchor tags of the personal-favourites list layout.
    elements = wd.find_elements(
        By.CSS_SELECTOR,
        '[class="fav-video-list clearfix content"]>li>a:nth-child(2)')
    for element in elements:
        href = element.get_attribute("href")
        # The aid is the path component after the fixed video URL prefix.
        aid = href.replace('https://www.bilibili.com/video/', '').replace('/', '')
        # One dict per video: URL, aid and title (author and release time
        # are added by later scripts).
        video_dict = {
            'aid': aid,
            'title': replace_x(element.get_attribute("title")),
            'url': href,
        }
        print(video_dict)
        video_lists.append(video_dict)
    return video_lists
# Collect the IDs of the (up to) 30 videos on the current uploader-profile page.
def get_up_page(wd, video_lists):
    """Append one dict per video (aid, title, url) on the current
    uploader profile page to video_lists and return the list."""
    # Anchor tags of the uploader-profile list layout.
    elements = wd.find_elements(By.CSS_SELECTOR, '.small-item a:nth-child(2)')
    for element in elements:
        href = element.get_attribute("href")
        # The aid is the path component after the fixed video URL prefix.
        aid = href.replace('https://www.bilibili.com/video/', '').replace('/', '')
        # One dict per video: URL, aid and title (author and release time
        # are added by later scripts).
        video_dict = {
            'aid': aid,
            'title': replace_x(element.get_attribute("title")),
            'url': href,
        }
        print(video_dict)
        video_lists.append(video_dict)
    return video_lists
def open_web(url):
    """Open Edge with the local user profile (so the site sees the already
    logged-in account), navigate to url and wait for manual confirmation.

    Returns the live WebDriver instance.
    """
    # Reusing the real browser profile lets the scraper piggyback on the
    # machine account's existing login session.
    profile_dir = r"C:\Users\yourCPname\AppData\Local\Microsoft\Edge\User Data"
    options = Options()
    options.add_argument(f"user-data-dir={profile_dir}")
    wd = webdriver.Edge(options=options)
    time.sleep(3)
    wd.get(url)
    input('请登录后继续。。。')
    return wd
# Strip characters that cannot appear in file names.
def replace_x(words):
    """Remove characters that are illegal in Windows file names.

    Bug fix: the old chain of .replace() calls missed the reserved
    characters < > " and \\ (it only stripped the fullwidth 〈〉), so some
    titles still produced invalid file names. str.translate removes the
    whole set in one pass.
    """
    # \ / : * ? " < > | are reserved on Windows; the fullwidth angle
    # brackets are stripped as well, matching the original behaviour.
    return words.translate(str.maketrans('', '', '\\/:*?"<>|〈〉'))
# Save the collected list.
def save_list(video_lists, path, name):
    """Print the collected videos and save them as an Excel file.

    path: directory ending with a path separator; name: the file name.
    """
    for video in video_lists:
        print(video)
    print('共采集到', len(video_lists), '条视频')
    # Switch the working directory to the target folder.
    os.chdir(path)
    # Fixed column order: the downstream scripts read columns by position.
    # Bug fix: the file used to be addressed via the *global* file_path
    # instead of the path parameter; the unused intermediate DataFrame
    # was dropped as well.
    pd.DataFrame(video_lists, columns=['aid', 'title', 'url']).to_excel(
        path + name, index=False)
    print('列表已保存:', path + name)
# --- main ---
file_path = 'D:\\下载站\\bilibili\\download\\'
name = 'video_info.xlsx'
# Favourites page URL (or switch to the uploader profile URL below).
url = 'https://space.bilibili.com/xxxxx/favlist'
# url = 'https://space.bilibili.com/xxxxx/video'
wd = open_web(url)
video_lists = []
time.sleep(2)
while True:
    # Read the total page count from the pager; fall back to a single
    # page when the pager element is missing (short lists).
    try:
        pages = wd.find_element(By.CSS_SELECTOR, '.be-pager-total').text
        pages = int(pages.replace('共 ', '').replace(' 页,', ''))
    except Exception:  # bug fix: was a bare except (swallowed KeyboardInterrupt)
        pages = 1
    print('收藏夹共', pages, '页')
    for i in range(1, pages + 1):
        print('正在采集第', i, '页')
        try:
            # video_lists = get_one_page(wd, video_lists)  # favourites page variant
            video_lists = get_up_page(wd, video_lists)  # uploader profile variant
            # Click "next page" and give the page time to load.
            wd.find_element(By.CSS_SELECTOR, 'ul.be-pager > li.be-pager-next').click()
            time.sleep(2)
        except Exception:  # bug fix: was a bare except
            print('已经到底啦')
    save_list(video_lists, file_path, name)
    # Optionally scrape another list into a new file.
    print('是否采集下一个(1/0)')
    is_over = int(input())
    if is_over == 0:
        break
    video_lists = []
    print('请输入保存文件名称:')
    name = input() + '.xlsx'
二、【批量下载抓取到的链接】
现在我们得到了一个Excel表格,里面包含众多视频信息,可以通过Python-requests进行下载。另外,因为某站的视频和音频是分开存储的,需要分别下载后再合并,这里用到Python的subprocess模块调用ffmpeg。
需要注意的是,如果请求头中的cookie留空,只能下载720P及以下清晰度的视频。登录个人账号后,打开任意视频,刷新网页,在F12-网络中,找到第一个元素打开请求标头,复制cookie,放入headers中,则可下载本账号能观看的最高清晰度。
批量下载视频。
# 下载多个视频(可选清晰度,可赋值发布时间,可以显示下载进度条)
import subprocess # 导入进程模块
import winsound
# 导入格式化输出模块
import os
import re
import requests
import json
from pprint import pprint
import pandas as pd
# 导入时间模块
from datetime import datetime
from tqdm import tqdm
# Load the scraped list into Video_lists.
def Excel_List(file_path, excel_name):
    """Read the scraped video list (aid, title, url) from an Excel file.

    Note: this also chdirs into file_path; the later relative file
    writes in this script rely on that side effect.
    """
    os.chdir(file_path)
    df = pd.read_excel(excel_name)
    # Columns are read by position: 0=aid, 1=title, 2=url.
    Video_lists = [
        {'aid': row[0], 'title': row[1], 'url': row[2]}
        for row in df.values.tolist()
    ]
    print('共收藏了', len(Video_lists), '条视频')
    return Video_lists
# Fetch the page for a video URL.
def request_header(url):
    """GET url with browser-like headers; return (response, headers).

    The same headers are reused later for the media downloads — the
    Referer doubles as the anti-hotlinking token.
    """
    headers = {
        # Account cookie; required for streams above 720p.
        'cookie': 'your own cookie',
        # Anti-hotlink referer.
        'Referer': 'your own Referer',
        # Browser identification (UA copied from DevTools).
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0'
    }
    response = requests.get(url, headers=headers)
    return response, headers
# Parse the playinfo JSON and extract the stream URLs.
def transcoding(video_info, select_definition=0):
    """Extract the audio and video stream URLs from the playinfo JSON.

    video_info: the JSON text captured from window.__playinfo__.
    select_definition: index into the dash video stream list; 0 is
    normally the highest definition, -1 the lowest. Out-of-range values
    fall back to 0.

    Returns (audio_url, video_url).
    """
    # The page embeds the data as a JSON string; decode it to a dict.
    json_data = json.loads(video_info)
    # Bilibili serves audio and video as separate dash streams, so both
    # URLs have to be extracted.
    videos = json_data['data']['dash']['video']
    # Bug fix: the parameter used to be overwritten with 0 on every
    # call, silently ignoring the caller's choice. Honour it, but guard
    # against indices outside the available stream list.
    if not -len(videos) <= select_definition < len(videos):
        select_definition = 0
    audio_url = json_data['data']['dash']['audio'][0]['baseUrl']
    video_url = videos[select_definition]['baseUrl']
    print('帧高度:', videos[select_definition]['height'],
          '帧宽度:', videos[select_definition]['width'])
    # Note: fetching these URLs without a Referer header yields 403.
    return audio_url, video_url
# Download the two streams and merge them.
def Download_Bilibili(file_path, video_name, video_url, audio_url, headers):
    """Download the separate audio and video streams, merge them with
    ffmpeg and store the result as file_path + video_name + '.mp4'.

    headers must carry the cookie/Referer returned by request_header,
    otherwise the CDN answers 403.
    """

    def _fetch(stream_url, target):
        # Stream the file to disk with a tqdm progress bar.
        response = requests.get(stream_url, headers=headers, stream=True)
        total_size = int(response.headers.get('content-length', 0))
        with open(target, 'wb') as file, tqdm(
                desc=os.path.basename(target),
                total=total_size,
                unit='B',
                unit_scale=True,
                unit_divisor=1024,
                ncols=100,       # progress bar width
                colour='green',  # progress bar colour
        ) as bar:
            for chunk in response.iter_content(chunk_size=1024):
                file.write(chunk)
                bar.update(len(chunk))

    # Temporary files are addressed explicitly under file_path instead
    # of relying on the current working directory.
    audio_file = file_path + 'audio.mp3'
    video_file = file_path + 'video.mp4'
    output_file = file_path + 'output.mp4'
    print('正在下载音频...')
    _fetch(audio_url, audio_file)
    print('正在下载视频...')
    _fetch(video_url, video_file)
    print('正在合成...')
    # Bug fix: the old f-string shell command broke on paths containing
    # spaces; the list form avoids the shell and quoting entirely.
    subprocess.run(['ffmpeg', '-i', video_file, '-i', audio_file,
                    '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
                    output_file, '-loglevel', 'quiet'])
    # Drop the raw streams and give the merged file its final name.
    os.remove(video_file)
    os.remove(audio_file)
    os.rename(output_file, file_path + video_name + '.mp4')
# Set the file timestamps to the release time.
def modify_time(file_path, video_name, release_time):
    """Set the video file's access/modification times to release_time.

    release_time format: 'YYYY-MM-DD HH:MM:SS' (interpreted as local
    time). os.utime can only change access and modification times, not
    the Windows creation time.
    """
    # Bug fix: the old code had a dead `a.timestamp()` statement whose
    # result was discarded.
    ts = datetime.strptime(release_time, '%Y-%m-%d %H:%M:%S').timestamp()
    os.utime(file_path + video_name + '.mp4', (ts, ts))
# Strip characters that cannot appear in file names.
def replace_x(words):
    """Remove characters that are illegal in Windows file names.

    Bug fix: the old chain of .replace() calls missed the reserved
    characters < > " and \\ (it only stripped the fullwidth 〈〉), so some
    titles still produced invalid file names. str.translate removes the
    whole set in one pass.
    """
    # \ / : * ? " < > | are reserved on Windows; the fullwidth angle
    # brackets are stripped as well, matching the original behaviour.
    return words.translate(str.maketrans('', '', '\\/:*?"<>|〈〉'))
# Audible completion signal.
def Sound_beep():
    """Play a short A4-C5-E5 arpeggio to signal that a download finished."""
    duration = 300  # milliseconds per note
    for freq in (440, 523, 659):  # A4, C5, E5 in Hz
        winsound.Beep(freq, duration)
# --- main ---
# Directory where the list is read from and the videos are stored.
file_path = 'D:\\下载站\\bilibili\\download\\'
Video_lists = Excel_List(file_path, 'name.xlsx')
# Video_lists = Excel_List(file_path, 'unload_video_list.xlsx')
Video_Failed = []  # entries that raised during download
for i, video in enumerate(Video_lists):
    try:
        print('正在下载第', i + 1, '个视频', ',共', len(Video_lists), '条视频')
        print(video)
        ID = video['aid']
        url = 'https://www.bilibili.com/video/' + ID + '/?vd_source=a3ea8651bac6d99d5eab35d21a5044fc'
        # Fetch the page (the request headers live inside the function).
        (response, headers) = request_header(url)
        # playinfo JSON embedded in the page source.
        video_info = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
        # Author name from the meta tags.
        author_name = re.findall(
            '<meta data-vue-meta="true" itemprop="author" name="author" content="(.*?)"><meta data-vue-meta="true"',
            response.text)[0]
        author_name = '【' + replace_x(author_name) + '】'
        print('作者姓名:\t', author_name)
        # Video title.
        title = replace_x(re.findall('<h1 data-title="(.*?)" title="', response.text)[0])
        print('作品标题:\t', title)
        video_name = author_name + ID + '【' + title + '】'
        # Release time from the meta tags.
        release_time = re.findall('<meta data-vue-meta="true" itemprop="uploadDate" content="(.*?)">', response.text)[0]
        print('time:\t', release_time)
        # Extract the stream URLs (0 = highest definition, -1 = lowest).
        (audio_url, video_url) = transcoding(video_info, 4)
        Download_Bilibili(file_path, video_name, video_url, audio_url, headers)
        # Stamp the file with the release time.
        modify_time(file_path, video_name, release_time)
        Sound_beep()
    except Exception:  # bug fix: was a bare except (swallowed KeyboardInterrupt)
        Video_Failed.append(video)
        print('出现错误,可能导致下载失败!')
print('视频失败列表:', '\n', Video_Failed)
批量下载封面
# 下载多个缩略图
import subprocess # 导入进程模块
import winsound
# 导入格式化输出模块
import os
import re
import requests
import json
from pprint import pprint
import pandas as pd
# 导入时间模块
from datetime import datetime
# Load the scraped list into Video_lists.
def Excel_List(file_path, excel_name):
    """Read the scraped video list (aid, title, url) from an Excel file.

    Note: this also chdirs into file_path; the later relative file
    writes in this script rely on that side effect.
    """
    os.chdir(file_path)
    df = pd.read_excel(excel_name)
    # Columns are read by position: 0=aid, 1=title, 2=url.
    Video_lists = [
        {'aid': row[0], 'title': row[1], 'url': row[2]}
        for row in df.values.tolist()
    ]
    print('共收藏了', len(Video_lists), '条视频')
    return Video_lists
# Fetch the page for a video URL.
def request_header(url):
    """GET url with browser-like headers; return (response, headers).

    Bug fix: a stray string literal used to be implicitly concatenated
    with the 'Host' key, producing the garbage header name
    'Safari/537.36 Edg/116.0.1938.62 Host'; the hard-coded
    Host 'api.bilibili.com' was also wrong for www.bilibili.com pages.
    requests derives the correct Host from the URL itself.
    """
    headers = {
        # Account cookie (present whether or not you are logged in).
        'cookie': 'your cookie',
        # Anti-hotlink referer.
        'Referer': 'your Referer',
        # Browser identification.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    response = requests.get(url, headers=headers)
    return response, headers
# Download the cover image.
def Download_Cover(video_name, response):
    """Find the cover image URL in the video page and save it in the
    current working directory as video_name plus its extension.

    Bug fix: the old code indexed findall(...)[0] for BOTH the .jpg and
    the .png pattern up front, so it crashed with IndexError whenever
    one of the two variants was absent from the page (the common case).
    It also always downloaded both variants even though at most one was
    written to disk.
    """
    for ext in ('.jpg', '.png'):
        match = re.search('itemprop="image" content="//(.*?)' + re.escape(ext),
                          response.text)
        if match is None:
            continue  # this variant is not on the page; try the next
        cover_url = 'https://' + match.group(1) + ext
        print(cover_url)
        img_response = requests.get(cover_url)
        if img_response.status_code == 200:
            with open(video_name + ext, 'wb') as img_file:
                img_file.write(img_response.content)
            print('封面图已成功下载为', video_name, ext)
        else:
            print('无法下载封面图')
        return
    print('未找到封面图链接')
# Set the file timestamps to the release time.
def modify_time(file_path, video_name, release_time):
    """Set the cover file's access/modification times to release_time.

    release_time format: 'YYYY-MM-DD HH:MM:SS' (interpreted as local
    time). os.utime can only change access and modification times, not
    the Windows creation time.
    """
    # Bug fix: the old code had a dead `a.timestamp()` statement whose
    # result was discarded.
    ts = datetime.strptime(release_time, '%Y-%m-%d %H:%M:%S').timestamp()
    os.utime(file_path + video_name + '.jpg', (ts, ts))
# Strip characters that cannot appear in file names.
def replace_x(words):
    """Remove characters that are illegal in Windows file names.

    Bug fix: the old chain of .replace() calls missed the reserved
    characters < > " and \\ (it only stripped the fullwidth 〈〉), so some
    titles still produced invalid file names. str.translate removes the
    whole set in one pass.
    """
    # \ / : * ? " < > | are reserved on Windows; the fullwidth angle
    # brackets are stripped as well, matching the original behaviour.
    return words.translate(str.maketrans('', '', '\\/:*?"<>|〈〉'))
# Audible completion signal.
def Sound_beep():
    """Play a short A4-C5-E5 arpeggio to signal that a download finished."""
    duration = 300  # milliseconds per note
    for freq in (440, 523, 659):  # A4, C5, E5 in Hz
        winsound.Beep(freq, duration)
# --- main ---
# Directory where the list is read from and the covers are stored.
file_path = 'D:\\下载站\\bilibili\\download\\'
Video_lists = Excel_List(file_path, 'name.xlsx')
for i, video in enumerate(Video_lists):
    try:
        print('正在下载第', i + 1, '个视频封面', ',共', len(Video_lists), '条视频')
        print(video)
        ID = video['aid']
        url = 'https://www.bilibili.com/video/' + ID + '/?spm_id_from=333.999.0.0&vd_source=a3ea8651bac6d99d5eab35d21a5044fc'
        # Fetch the page (the request headers live inside the function).
        (response, headers) = request_header(url)
        # Author name from the meta tags.
        author_name = re.findall(
            '<meta data-vue-meta="true" itemprop="author" name="author" content="(.*?)"><meta data-vue-meta="true"',
            response.text)[0]
        author_name = '【' + replace_x(author_name) + '】'
        print('作者姓名:\t', author_name)
        # Video title.
        title = replace_x(re.findall('<h1 data-title="(.*?)" title="', response.text)[0])
        print('作品标题:\t', title)
        video_name = author_name + ID + '【' + title + '】'
        # Release time from the meta tags.
        release_time = re.findall('<meta data-vue-meta="true" itemprop="uploadDate" content="(.*?)">', response.text)[0]
        print('time:\t', release_time)
        # Download the cover and stamp it with the release time.
        # (The unused playinfo/cover_info extractions were removed.)
        Download_Cover(video_name, response)
        modify_time(file_path, video_name, release_time)
    except Exception:  # bug fix: was a bare except (swallowed KeyboardInterrupt)
        print('下载失败!')
# Sound_beep()
三、【对比更新本地数据库】
假如说其中有下载失败,或是下次想继续更新下载新的视频,可以重复操作一后,刷新Excel列表。然后通过下面的脚本实现本地数据和列表数据的对比,找出哪些视频是列表里有的而本地没有的(未下载视频),哪些视频是下载到本地的但是在网络上挂掉的(已失效视频)。Ps:第二种查找方式虽然没什么用,但是每每看见搜出来的失效视频列表,可以得到某种“还好缓存了”的满足感。
# 比较收藏夹列表和已经下载的视频列表,看看谁没下(本地没有网上有),谁失效(本地有网上没有)
## 检查已下载的视频是否在列表中
import winsound
# 导入格式化输出模块
import os
import re
import requests
# 导入时间模块
from datetime import datetime
import shutil
import pandas as pd
# Load the favourites list into Video_lists.
def Excel_List(file_path, name):
    """Read the scraped video list (aid, title, url) from an Excel file.

    Note: this also chdirs into file_path; the final save_list call in
    this script relies on that side effect.
    """
    os.chdir(file_path)
    df = pd.read_excel(name)
    # Columns are read by position: 0=aid, 1=title, 2=url.
    Video_lists = [
        {'aid': row[0], 'title': row[1], 'url': row[2]}
        for row in df.values.tolist()
    ]
    print('共收藏了', len(Video_lists), '条视频')
    return Video_lists
# Collect the already-downloaded videos into Vols.
def Vols_List(folder_path):
    """Return the file names of all videos (.mp4/.avi/.mov) directly
    inside folder_path."""
    # endswith accepts a tuple of suffixes: one call instead of an
    # or-chain; the manual append loop became a comprehension.
    Vols = [f for f in os.listdir(folder_path)
            if f.endswith(('.mp4', '.avi', '.mov'))]
    print('共存储了', len(Vols), '条视频')
    return Vols
# Check whether a local file name is still present in the favourites.
def Is_exists_in_favorite(Vol, Video_lists):
    """Check whether the local file Vol belongs to the favourites list.

    The BV id ('BV' followed by 10 characters) is cut out of the file
    name and matched against the 'aid' field of each favourites entry.
    Returns (1, Vol) when a match exists, otherwise prints a dead-video
    notice and returns (0, Vol).
    """
    start = Vol.find('BV')
    bv_id = Vol[start:start + 12]  # 'BV' plus the 10-character id
    if any(entry['aid'] == bv_id for entry in Video_lists):
        return 1, Vol
    print('视频:【', Vol, '】不在收藏夹中,属于已失效视频')
    return 0, Vol
# Check whether a favourites entry has already been downloaded.
def Is_exists_in_native(Video_list, Vols):
    """Check whether the favourites entry Video_list exists locally.

    A local file matches when the BV id cut out of its name equals the
    entry's 'aid'. Returns (1, Video_list) when found, otherwise prints
    an undownloaded-video notice and returns (0, Video_list).
    """
    aid = Video_list['aid']
    for file_name in Vols:
        start = file_name.find('BV')
        if aid == file_name[start:start + 12]:  # 'BV' plus 10 characters
            return 1, Video_list
    print('视频:【', aid, '】不在本地,属于未下载视频')
    return 0, Video_list
# Save the resulting list.
def save_list(video_lists, path, name):
    """Save the video dicts as an Excel file at path + name."""
    # Switch the working directory to the target folder.
    os.chdir(path)
    # Fixed column order so the download script can re-read the file.
    # Bug fix: the file used to be addressed via the *global* file_path
    # instead of the path parameter; the unused intermediate DataFrame
    # was dropped as well.
    pd.DataFrame(video_lists, columns=['aid', 'title', 'url']).to_excel(
        path + name, index=False)
# --- main ---
file_path = 'D:\\下载站\\bilibili\\download\\'    # where the Excel list lives
folder_path = 'D:\\下载站\\bilibili\\download\\1'  # where the videos live
# Favourites entries from the Excel list.
Video_lists = Excel_List(file_path, 'name.xlsx')
# Video files found on disk.
Vols = Vols_List(folder_path)
print('\n')
# Compare both directions: local files missing from the list are "dead",
# list entries missing locally are "not yet downloaded".
dead_video_list = []
unload_video_list = []
for Vol in Vols:
    (is_Existence, vol) = Is_exists_in_favorite(Vol, Video_lists)
    if is_Existence == 0:
        dead_video_list.append(vol)
print('\n')
for Video_list in Video_lists:
    (is_Existence, vol) = Is_exists_in_native(Video_list, Vols)
    if is_Existence == 0:
        unload_video_list.append(vol)
print('\n')
print('失效视频列表:', len(dead_video_list))
for dead_video in dead_video_list:
    print(dead_video)
print('\n')
print('未下载视频列表:', len(unload_video_list))
for unload_video in unload_video_list:
    print(unload_video)
# The undownloaded entries can be fed straight back into the downloader.
save_list(unload_video_list, file_path, 'unload_video_list.xlsx')