三个脚本:
【Favorites_collection.py】爬取收藏夹
通过selenium爬取个人收藏夹里所有视频的aid(需要登录自己某站账号),然后将视频信息批量存入一个Excel文件
【Download_multiple_video.py】批量视频下载
通过requests批量下载Excel里列出的视频。在视频网站登录自己的账号并将cookie写入请求头后,会员可以下载1080P,大会员可以下载4K清晰度;游客(无cookie)最高只能下载720P。该网站的音频和视频是分离的,需要分别下载后再合并,合并工具是ffmpeg(下载安装方法见《【ffmpeg基础】ffmpeg的下载安装 - 知乎 (zhihu.com)》;安装过程有些玄学,我的另一台电脑怎么都装不成功)。下载完成后,可将视频的修改日期改为视频在网上的发布日期(可选功能),每下载完一个视频还可以发出提示音(可选)。
【Check_video.py】核对(可选)
这个脚本用来比较收藏夹内(Excel)视频和本地(已下载)视频,来检查是否有视频未下载,或者时间久了收藏夹视频失效。逻辑就是:
比较本地某个视频在不在收藏夹内,如果不在,则判定为失效视频;
比较某个收藏夹视频是否在本地,如果不在,则判定为未下载视频。未下载视频将会生成列表,用户可以调取列表重新进行下载。
【Favorites_collection.py】爬取视频网站的收藏夹
# 爬取收藏夹所有作品信息,只读脚本(第一步)
from pprint import pprint
import requests
from selenium import webdriver
from selenium.common import NoSuchElementException
from selenium.webdriver.common.by import By
import pandas as pd
import openpyxl
import os
import time
def get_one_page(wd, video_lists):
    """Collect the videos listed on the favorites page currently displayed.

    Every anchor matched by the favorites-list selector contributes one dict
    with the keys 'aid', 'title' and 'url'. Each dict is printed and appended
    to ``video_lists``.

    Args:
        wd: WebDriver positioned on a favorites page.
        video_lists: List that receives the collected dicts (mutated in place).

    Returns:
        The same ``video_lists`` object that was passed in.
    """
    selector = '[class="fav-video-list clearfix content"]>li>a:nth-child(2)'
    for anchor in wd.find_elements(By.CSS_SELECTOR, selector):
        href = anchor.get_attribute("href")
        # The aid is what is left of the link once the site prefix and any
        # remaining slashes have been removed.
        aid = href.replace('https://www.xxxxxxx.com/video/', '').replace('/', '')
        title = replace_x(anchor.get_attribute("title"))
        entry = {'aid': aid, 'title': title, 'url': href}
        print(entry)
        video_lists.append(entry)
    return video_lists
def open_web(url):
    """Open ``url`` in a new Edge session and pause until the user has logged in.

    Args:
        url: Address to load.

    Returns:
        The Edge WebDriver, configured with a 3-second implicit wait.
    """
    driver = webdriver.Edge()
    driver.implicitly_wait(3)
    driver.get(url)
    # Blocks until Enter is pressed so the login can be done by hand in the browser.
    input('请登录后继续。。。')
    return driver
def replace_x(words):
    """Return ``words`` with the characters unsuitable for file names removed.

    The characters dropped are: / : * ? 〈 〉 |
    """
    unwanted = '/:*?〈〉|'
    return ''.join(char for char in words if char not in unwanted)
def save_list(video_lists, path, name):
    """Print the collected videos and write them to an Excel workbook.

    Args:
        video_lists: List of dicts with the keys 'aid', 'title' and 'url'.
        path: Directory that receives the workbook.
        name: Workbook file name, e.g. 'video_info.xlsx'.
    """
    for video in video_lists:
        print(video)
    print('共采集到', len(video_lists), '条视频')
    # Resolve the target from the ``path`` argument (not a module-level global)
    # and do it before chdir so a relative ``path`` is not applied twice.
    target = os.path.abspath(os.path.join(path, name))
    # The working-directory switch is an existing side effect and is kept.
    os.chdir(path)
    pd.DataFrame(video_lists, columns=['aid', "title", "url"]).to_excel(target, index=False)
# Entry point: walk every page of the favorites folder and save the list to Excel.
file_path = 'D:\\下载站\\test\\'
name = 'video_info.xlsx'
# Address of the favorites page to crawl.
url = 'https://space.xxxxx.com/xxxxxx/favlist?fid=xxxxxxx&ftype=create'
wd = open_web(url)
video_lists = []
# Give the page a moment to render before the pager is read.
time.sleep(2)
# The pager text wraps the page count in '共 ' and ' 页,'; strip both to get the number.
pages = wd.find_element(By.CSS_SELECTOR, '.be-pager-total').text
pages = pages.replace('共 ', '')
pages = pages.replace(' 页,', '')
pages = int(pages)
print('收藏夹共', pages, '页')
# Page 1 is already on screen, so it is collected before any click.
print('正在采集第', 1, '页')
video_lists = get_one_page(wd, video_lists)
# Only pages 2..pages need a "next page" click; clicking once per page
# (including page 1) would attempt one click too many on the last page.
for i in range(2, pages + 1):
    print('正在采集第', i, '页')
    try:
        wd.find_element(By.CSS_SELECTOR, 'ul.be-pager > li.be-pager-next').click()
        time.sleep(2)
        video_lists = get_one_page(wd, video_lists)
    except Exception as error:
        # Report the reason and stop paging; whatever was collected is still saved.
        print('已经到底啦', error)
        break
save_list(video_lists, file_path, name)
【Download_multiple_video.py】批量视频下载
# 下载多个视频(可选清晰度,可赋值发布时间)
import subprocess # 导入进程模块
import winsound
# 导入格式化输出模块
import os
import re
import requests
import json
from pprint import pprint
import pandas as pd
# 导入时间模块
from datetime import datetime
def Excel_List(file_path, excel_name):
    """Load the video list stored in an Excel workbook.

    Switches the working directory to ``file_path``, reads ``excel_name`` from
    there and maps the first three cells of every row to the keys 'aid',
    'title' and 'url'.

    Args:
        file_path: Directory containing the workbook.
        excel_name: Workbook file name.

    Returns:
        A list with one dict per row.
    """
    os.chdir(file_path)
    rows = pd.read_excel(excel_name).values.tolist()
    Video_lists = [{'aid': row[0], 'title': row[1], 'url': row[2]} for row in rows]
    print('共收藏了', len(Video_lists), '条视频')
    return Video_lists
def request_header(url):
    """Fetch ``url`` with browser-like request headers.

    Args:
        url: Address of the video page.

    Returns:
        A ``(response, headers)`` tuple so the same headers can be reused for
        the follow-up media requests.
    """
    headers = {
        # Account cookie. Placeholder text: replace it with a real cookie value.
        'cookie': '自己的账号cookie信息',
        # Anti-hotlinking referer. Placeholder text: replace it with the
        # address of any playable video page.
        'Referer': '任意一个可播放视频网址',
        # Browser identification. A stray extra string fragment after this
        # entry made the dict literal a syntax error and has been removed.
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Edg/119.0.0.0',
    }
    # A timeout keeps one unresponsive server from blocking the whole batch.
    response = requests.get(url, headers=headers, timeout=30)
    return response, headers
def transcoding(video_info, select_definition):
    """Extract the audio and video stream addresses from a play-info JSON string.

    Args:
        video_info: JSON text holding the ``data.dash.audio`` and
            ``data.dash.video`` stream lists.
        select_definition: Index into the video stream list. 0 selects the
            first entry (normally the highest definition) and -1 the last.
            An index outside the list is clamped to the nearest valid entry
            instead of raising.

    Returns:
        An ``(audio_url, video_url)`` tuple.

    Raises:
        KeyError, IndexError: If the JSON lacks the expected structure or has
            no audio/video streams.
    """
    dash = json.loads(video_info)['data']['dash']
    audio_url = dash['audio'][0]['baseUrl']
    streams = dash['video']
    # Honour the caller's choice, but keep it inside the available range so a
    # video offering fewer streams than requested still downloads.
    index = max(-len(streams), min(select_definition, len(streams) - 1))
    chosen = streams[index]
    print('帧高度:', chosen['height'],
          '帧宽度:', chosen['width'])
    return audio_url, chosen['baseUrl']
def _download_to_file(url, path, headers):
    """Stream the body of ``url`` into ``path`` without buffering it all in memory."""
    with requests.get(url=url, headers=headers, stream=True, timeout=30) as response:
        # Fail here instead of saving an error page as if it were media.
        response.raise_for_status()
        with open(path, mode='wb') as target:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                target.write(chunk)


def Download_Bilibili(video_name, video_url, audio_url, headers):
    """Download the separate audio and video streams and merge them with ffmpeg.

    The result is written to ``video_name + '.mp4'``. The temporary audio file
    and the intermediate merged file are removed once the merge succeeds.

    Args:
        video_name: Target path without extension.
        video_url: Address of the video stream.
        audio_url: Address of the audio stream.
        headers: Request headers sent with both downloads.

    Raises:
        requests.HTTPError: If either download returns an error status.
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
            The downloaded parts are left on disk in that case.
    """
    audio_path = video_name + '.mp3'
    video_path = video_name + '.mp4'
    merged_path = video_name + 'output.mp4'

    print('正在下载...')
    _download_to_file(audio_url, audio_path, headers)
    _download_to_file(video_url, video_path, headers)

    print('正在合成...')
    # An argument list (no shell) keeps spaces and shell metacharacters in the
    # file name from breaking or altering the command. '-y' overwrites a stale
    # output file instead of waiting for an interactive answer.
    cmd = [
        'ffmpeg', '-y', '-loglevel', 'quiet',
        '-i', video_path,
        '-i', audio_path,
        '-c:v', 'copy', '-c:a', 'aac', '-strict', 'experimental',
        merged_path,
    ]
    # check=True stops before the sources are deleted when the merge fails.
    subprocess.run(cmd, check=True)

    os.remove(audio_path)
    # Replace the video-only file with the merged one in a single step.
    os.replace(merged_path, video_path)
def modify_time(video_name, release_time):
    """Set the access and modification time of ``video_name + '.mp4'``.

    Args:
        video_name: File path without the '.mp4' extension.
        release_time: Timestamp text in the form 'YYYY-MM-DD HH:MM:SS'.
    """
    stamp = datetime.strptime(release_time, '%Y-%m-%d %H:%M:%S').timestamp()
    # os.utime changes only the access and modification times, not the creation time.
    os.utime(video_name + '.mp4', (stamp, stamp))
def replace_x(words):
    """Return ``words`` with characters that cannot be used in a file name removed.

    Removes the characters Windows rejects in file names (< > : " / \\ | ? *),
    the CJK angle brackets 〈 〉, and spaces.

    Args:
        words: Text to clean, e.g. a video title or author name.

    Returns:
        The cleaned text.
    """
    # '<', '>', '"' and '\\' are included as well: they are just as invalid in
    # a Windows file name as the others and would make the later open() fail.
    unwanted = '/\\:*?"<>〈〉| '
    return words.translate(str.maketrans('', '', unwanted))
def Sound_beep():
    """Play three 300 ms tones in sequence: 440 Hz, 523 Hz, then 659 Hz."""
    duration_ms = 300
    for frequency_hz in (440, 523, 659):
        winsound.Beep(frequency_hz, duration_ms)
# Entry point: download every video listed in the workbook.
# Directory holding the workbook; downloads are written here as well.
file_path = 'D:\\下载站\\test\\'
# Swap the workbook name for 'video_info.xlsx' to read that list instead.
Video_lists = Excel_List(file_path, 'unload_video_list.xlsx')
# Index passed to transcoding(): 0 is the first listed stream (normally the
# highest definition), -1 the last.
SELECT_DEFINITION = 0
Video_Failed = []
for i, video in enumerate(Video_lists):
    try:
        print('正在下载第', i + 1, '个视频', ',共', len(Video_lists), '条视频')
        print(video)
        ID = str(video['aid'])
        # Page address of the video; the vd_source value must be filled in.
        url = 'https://www.xxxxxx.com/video/' + ID + '/?spm_id_from=333.999.0.0&vd_source=这个一定要有,不然不能下载高清'
        (response, headers) = request_header(url)
        # Play info embedded in the page as JSON.
        video_info = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)[0]
        # Author name
        author_name = \
            re.findall(
                '<meta data-vue-meta="true" itemprop="author" name="author" content="(.*?)"><meta data-vue-meta="true"',
                response.text)[0]
        author_name = replace_x(author_name)
        author_name = '【' + author_name + '】'
        print('作者姓名:\t', author_name)
        # Title
        # NOTE(review): the data-v-... attribute looks build-specific; confirm
        # the pattern still matches the current page markup.
        title = re.findall('class="video-title" data-v-4f1c0915>(.*?)</h1>', response.text)[0]
        title = replace_x(title)
        print('作品标题:\t', title)
        video_name = file_path + author_name + ID + '【' + title + '】'
        # Publication time
        release_time = re.findall('<meta data-vue-meta="true" itemprop="uploadDate" content="(.*?)">', response.text)[0]
        print('time:\t', release_time)
        (audio_url, video_url) = transcoding(video_info, SELECT_DEFINITION)
        Download_Bilibili(video_name, video_url, audio_url, headers)
        # Stamp the file with the publication time.
        modify_time(video_name, release_time)
        Sound_beep()
    except Exception as error:
        # Record the failure with its reason and carry on with the next video.
        # Ctrl+C is not caught here, so the batch can still be interrupted.
        Video_Failed.append(video)
        print('出现错误,可能导致下载失败!', error)
print('视频失败列表:', '\n', Video_Failed)
【Check_video.py】核对(可选)
# 比较收藏夹列表和已经下载的视频列表,看看谁没下(本地没有网上有),谁失效(本地有网上没有)
import winsound
# 导入格式化输出模块
import os
import re
import requests
# 导入时间模块
from datetime import datetime
import shutil
import pandas as pd
def Excel_List(file_path, name):
    """Read the favorites list from an Excel workbook.

    The working directory is changed to ``file_path`` before ``name`` is read.
    The first three cells of each row become the 'aid', 'title' and 'url'
    entries of a dict.

    Args:
        file_path: Directory containing the workbook.
        name: Workbook file name.

    Returns:
        A list with one dict per row.
    """
    os.chdir(file_path)
    table = pd.read_excel(name)
    keys = ('aid', 'title', 'url')
    Video_lists = [
        {key: row[position] for position, key in enumerate(keys)}
        for row in table.values.tolist()
    ]
    print('共收藏了', len(Video_lists), '条视频')
    return Video_lists
def Vols_List(folder_path):
    """List the video files stored directly inside ``folder_path``.

    An entry counts as a video when its name ends with '.mp4', '.avi' or '.mov'.

    Args:
        folder_path: Directory to scan.

    Returns:
        The matching entry names, in the order ``os.listdir`` reports them.
    """
    video_suffixes = ('.mp4', '.avi', '.mov')
    Vols = [entry for entry in os.listdir(folder_path) if entry.endswith(video_suffixes)]
    print('共存储了', len(Vols), '条视频')
    return Vols
def Is_exists_in_favorite(Vol, Video_lists):
    """Check whether a local video file belongs to a video in the favorites list.

    The file name is searched for ids of the form 'BV' followed by ten more
    characters. Every occurrence of 'BV' is considered, so an author name that
    itself contains 'BV' does not hide the real id later in the name.

    Args:
        Vol: File name of the local video.
        Video_lists: Favorites entries, each a dict with an 'aid' key.

    Returns:
        ``(1, Vol)`` when one of the ids in the name is in the favorites list,
        otherwise ``(0, Vol)`` after printing a notice.
    """
    # Each 12-character slice that starts at a 'BV' is a possible id.
    candidates = {Vol[match.start():match.start() + 12] for match in re.finditer('BV', Vol)}
    if any(video['aid'] in candidates for video in Video_lists):
        return 1, Vol
    print('视频:【', Vol, '】不在收藏夹中,属于已失效视频')
    return 0, Vol
def Is_exists_in_native(Video_list, Vols):
    """Check whether a favorites entry has already been downloaded.

    A local file matches when the 12 characters starting at any 'BV' in its
    name equal the entry's aid. Every occurrence of 'BV' is tried, so an author
    name that itself contains 'BV' does not hide the real id later in the name.

    Args:
        Video_list: Favorites entry, a dict with an 'aid' key.
        Vols: File names of the local videos.

    Returns:
        ``(1, Video_list)`` when a matching file exists, otherwise
        ``(0, Video_list)`` after printing a notice.
    """
    aid = Video_list['aid']
    for Vol in Vols:
        for match in re.finditer('BV', Vol):
            if Vol[match.start():match.start() + 12] == aid:
                return 1, Video_list
    print('视频:【', aid, '】不在本地,属于未下载视频')
    return 0, Video_list
def save_list(video_lists, path, name):
    """Write a list of video entries to an Excel workbook.

    Args:
        video_lists: List of dicts with the keys 'aid', 'title' and 'url'.
        path: Directory that receives the workbook.
        name: Workbook file name, e.g. 'unload_video_list.xlsx'.
    """
    # Resolve the target from the ``path`` argument (not a module-level global)
    # and do it before chdir so a relative ``path`` is not applied twice.
    target = os.path.abspath(os.path.join(path, name))
    # The working-directory switch is an existing side effect and is kept.
    os.chdir(path)
    pd.DataFrame(video_lists, columns=['aid', "title", "url"]).to_excel(target, index=False)
# Entry point: compare the favorites list with the downloaded files.
file_path = 'D:\\下载站\\test\\'  # directory holding the Excel lists
folder_path = 'D:\\下载站\\test\\1'  # directory holding the downloaded videos
# Favorites entries read from the workbook.
Video_lists = Excel_List(file_path, 'video_info.xlsx')
# Names of the video files found locally.
Vols = Vols_List(folder_path)
# Local files whose id is not in the favorites list are reported as expired.
favorite_checks = [Is_exists_in_favorite(Vol, Video_lists) for Vol in Vols]
dead_video_list = [vol for is_Existence, vol in favorite_checks if is_Existence == 0]
# Favorites entries without a local file are reported as not yet downloaded.
native_checks = [Is_exists_in_native(Video_list, Vols) for Video_list in Video_lists]
unload_video_list = [vol for is_Existence, vol in native_checks if is_Existence == 0]
print('\n')
print('失效视频列表:')
for dead_video in dead_video_list:
    print(dead_video)
print('\n')
print('未下载视频列表:')
for unload_video in unload_video_list:
    print(unload_video)
# Persist the missing entries so they can be used as a download list.
save_list(unload_video_list, file_path, 'unload_video_list.xlsx')