本文提供一个简单的下载b站视频的脚本。
一般爬取视频的话肯定要找到其对应的url,有时候可能是以路径的形式隐藏在源码里,有时候是通过Ajax动态填充到源码里。
找到其对应的url后,再通过urllib,requests等库来模拟浏览器发送请求获取二进制码。
这就是最基本的爬取思路,接下来就来实现。
比如说这个视频,url:https://www.bilibili.com/video/BV1Eb411u7Fw?p=5&vd_source=a710d8ce6c660aea6bcb6930aeede828。观察一下视频播放页面的源码,发现视频的url就在里面。
那就来用requests库来获取其html然后用BeautifulSoup或者re等库来截取出其中的url。先试试看获取的html有没有问题,然后再截取。
import requests
# Desktop-browser User-Agent plus the cookies of a logged-in account, sent so
# bilibili serves the page as if a real, authenticated browser requested it.
# NOTE(review): the Cookie value below contains LIVE session credentials
# (SESSDATA / bili_jct) — anyone who sees them can act as this account.
# They should be redacted before publishing.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0',
'Cookie': 'buvid4=18A5B657-1D1F-BA5A-7982-66CC7BF00FBB41225-023062217-9TO%2BhWhxQHPbLQjQPTC0Pz2xncxATKt8FZh8u%2Bs0tmucIcZ2qIW07Q%3D%3D; DedeUserID=488983777; DedeUserID__ckMd5=9ab81020f43bec82; rpdid=0zbfAHHhNv|j1FjL4bJ|32f|3w1QcjTL; buvid_fp_plain=undefined; CURRENT_BLACKGAP=0; enable_web_push=DISABLE; FEED_LIVE_VERSION=V_WATCHLATER_PIP_WINDOW3; LIVE_BUVID=AUTO7417142164692956; PVID=1; buvid3=5947BDBA-D2A7-299E-988D-229FEF4AF69227989infoc; b_nut=1718967227; _uuid=4D10E68A2-4BB3-710810-346D-12EC56FAA67724004infoc; header_theme_version=CLOSE; CURRENT_QUALITY=116; fingerprint=9140a9275e3e3da6d9e8696c9347c3e0; home_feed_column=5; browser_resolution=1659-804; buvid_fp=9140a9275e3e3da6d9e8696c9347c3e0; SESSDATA=4074c6bf%2C1736075142%2C63631%2A72CjBDomz8sU7da-r6gnLzWuuttvD2h-M2doq1SW6Ar9HddH9cTTpEhrknf2QAIKgPXAMSVlptM3c2Uk9ISXBMcGJCRHZGZ3FHNmxyUlhBTHlXTGFSbnZ5Q3k0N2JHbjVLT25WeXF2Q291cmo2Y192UjBoMzI5aGNiNUJNaVU0eFl0UWpsM0h2SmFnIIEC; bili_jct=6abc5a9b3e9c924a5ab7bbe988c7089c; bili_ticket=eyJhbGciOiJIUzI1NiIsImtpZCI6InMwMyIsInR5cCI6IkpXVCJ9.eyJleHAiOjE3MjA5NTAyODYsImlhdCI6MTcyMDY5MTAyNiwicGx0IjotMX0.UDMELCM9Ag5RdG05T0IG9y28UJf6idQMQmafDB8yJTs; bili_ticket_expires=1720950226; sid=5srz5nm6; b_lsid=864484C10_190A645A413; bp_t_offset_488983777=953217229128728576; CURRENT_FNVAL=4048'}
# Target video page whose HTML we want to inspect for embedded media URLs.
url = 'https://www.bilibili.com/video/BV1Eb411u7Fw?p=5&vd_source=a710d8ce6c660aea6bcb6930aeede828'
response = requests.get(url, headers=headers)
# Dump the raw HTML to eyeball whether (and at what quality) video URLs appear.
print(response.text)
发现返回的html有视频的url,但是多多观察一下html会发现,返回的html里的url最高画质也就才32(对应清晰)。
这按道理来说不应该的,因为我设置了我自己的会员账号的cookie,按理来说能看最高画质的,但这里就只有清晰。会不会是视频原本就只有清晰画质呢?查看了一下视频,发现并不是这样。那就奇怪了,再试试访问其它视频,会发现有些时候最高画质是112(1080P+)有些时候就是32。不稳定。这就不对了,我们看视频肯定要看高画质的,低画质的看着有什么意思。
这个问题我之前批量下载视频的时候也遇到过,一时没有解决。有没有大佬知道这是什么原因的,欢迎评论区里说一下。
后来我用pyppeteer来爬取发现就没有这个问题了。
import requests
import re
from pyppeteer import launch
import asyncio


async def main():
    """Open the video page in a real (visible) browser and dump its rendered HTML.

    Using a browser instead of a plain HTTP request lets bilibili's JavaScript
    inject the play-info data before we read the page source.
    """
    # headless=False so the run is observable; userDataDir persists the
    # logged-in session between runs.
    browser = await launch(headless=False, userDataDir='./userdata',
                           args=['--disable-infobars'])
    page = await browser.newPage()
    await page.goto('https://www.bilibili.com/video/BV1Eb411u7Fw?p=5&vd_source=a710d8ce6c660aea6bcb6930aeede828')
    # Give the page time to finish its dynamic (Ajax) fills before reading it.
    await asyncio.sleep(10)
    print(await page.content())
    await page.close()
    # Fix: the original leaked the browser process — close it explicitly.
    await browser.close()


if __name__ == '__main__':
    asyncio.run(main())
我猜测,可能b站有时候把url放在静态网页里,有时候动态地穿插进去。可能也是一种反爬手段吧。
所以,有了这个问题,就只能用模拟浏览器的三方库来爬取视频了。
这里我就用pyppeteer库来写。pyppeteer库可以保存用户数据,使用户数据持久化。只需要用户自己登陆一次就可以了。
首先,获取用户的数据并保存:
from pyppeteer import launch
import asyncio


async def main():
    """One-off login helper: open the Bilibili login page in a visible browser.

    Log in manually; the session is persisted into ./userdata (via userDataDir)
    and reused by the headless download script, so this only needs to run once.
    """
    browser = await launch(headless=False, userDataDir='./userdata')
    page = await browser.newPage()
    await page.goto('https://passport.bilibili.com/login')
    # The browser is intentionally left open so the user can complete the login;
    # the coroutine returning early is what triggers the (harmless) console error.


if __name__ == '__main__':
    asyncio.run(main())
登录完后控制台可能会报错,这个没关系,只要有userdata这个文件夹出现就行。如果大佬能解决这个协程报错问题的话可以在评论区里说一下,我好学习学习。
然后就是主程序:
import requests
import re
from pyppeteer import launch
from lxml import etree
import os
import subprocess
import asyncio
from tenacity import retry, stop_after_attempt
from pathlib import Path  # resolves the relative profile dir to an absolute path, required for headless mode
# Desktop User-Agent reused for the CDN download requests (Referer added per request).
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'}
path = Path('./userdata')  # persisted browser profile (saved login) used by the headless browser
@retry(stop=stop_after_attempt(2))
async def scrape(url):
    """Render *url* in a headless browser and return the page's HTML.

    Uses the persisted ./userdata profile so the logged-in session (and thus
    high-quality stream URLs) is available. Retried up to 2 times via tenacity
    when the content comes back empty.

    Raises:
        ValueError: if the rendered page content is empty.
    """
    browser = await launch(headless=True, userDataDir=path.resolve(),
                           args=['--User-Agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.36 Edg/121.0.0.0'])
    try:
        page = await browser.newPage()
        await page.goto(url)
        # Short pause so the dynamically injected play-info <script> is present.
        await asyncio.sleep(2)
        content = await page.content()
    finally:
        # Fix: the original leaked a browser process on every call (and on
        # every retry). Always shut it down.
        await browser.close()
    # page.content() returns a str, never None, so test emptiness instead.
    if not content:
        raise ValueError
    return content
def get_url(html):
    """Extract the media stream URLs and the video title from rendered HTML.

    The play-info JSON sits in the page's 4th <script> tag; its text is
    stringified and scanned with regexes for the various stream variants.

    Returns:
        dict with keys 'common_video', 'common_music', 'vip_video',
        'vip_music' (each a URL string or None when not found) and 'name'
        (title from the keywords meta tag, spaces stripped, or a placeholder).
    """
    tree = etree.HTML(html)
    # str() of the xpath result list keeps the raw JSON searchable as one blob.
    script_text = str(tree.xpath('//script[4]/text()'))

    def _first(pattern, text):
        # Return group(1) of the first match, or None — avoids searching twice.
        match = re.search(pattern, text)
        return match.group(1) if match else None

    common_video = _first(re.compile('.*?"baseUrl":"(.*?)"', re.S), script_text)
    # id 30216 identifies the audio track in the common (non-vip) play info.
    common_music = _first(re.compile('.*?"id":30216,"baseUrl":"(.*?)"', re.S), script_text)
    vip_video = _first(re.compile(r'.*?"backupUrl":\["(.*?)"]', re.S), script_text)
    vip_music = _first(re.compile('"mimeType":"audio/mp4".*?"baseUrl":"(.*?)"', re.S), script_text)
    name = _first(re.compile('<meta.*?name="keywords" content="(.*?),.*?">', re.S), html)
    if name is not None:
        name = name.replace(' ', '')
    else:
        name = 'name匹配失败,自己手动填'
    return {'common_video': common_video,
            'common_music': common_music,
            'vip_video': vip_video,
            'vip_music': vip_music,
            'name': name}
def get_path(video_path, music_path, name):
    """Build the output file paths for the separate video and audio tracks.

    Returns:
        (video_file, music_file) — '<video_path>/<name>.mp4' and
        '<music_path>/<name>.mp3'.
    """
    # os.path.join instead of manual '//' concatenation: correct separator on
    # every platform, and consistent with makefile().
    video = os.path.join(video_path, f'{name}.mp4')
    music = os.path.join(music_path, f'{name}.mp3')
    return video, music
def transform_bytes(url1, url2, path):
    """Stream-download the media at *url1* into the file *path*.

    Args:
        url1: media URL extracted from the HTML (may be None when no match).
        url2: the original video page URL, sent as the Referer header —
              bilibili's CDN rejects requests without it.
        path: destination file path.

    Returns:
        True on success, False when url1 is missing or the request fails.
    """
    if url1 is None:
        # Fix: the original fell through and implicitly returned None; return
        # an explicit False so callers' fallback logic reads cleanly.
        return False
    # Fix: copy instead of mutating the module-level headers dict in place.
    req_headers = dict(headers)
    req_headers['Referer'] = url2
    response = requests.get(url1, headers=req_headers, timeout=None, stream=True)
    if response.status_code != 200:
        return False
    with open(path, 'wb') as file:
        # stream=True + iter_content keeps memory flat for large videos.
        for chunk in response.iter_content(chunk_size=1024):
            file.write(chunk)
    return True
def makefile(path):
    """Ensure <path>/video and <path>/music exist; return their paths.

    Returns:
        (video_path, music_path) tuple of directory paths.
    """
    # os.path.join instead of hard-coded '\\' so paths are correct on every OS.
    video_path = os.path.join(path, 'video')
    music_path = os.path.join(path, 'music')
    # Fix: the original checked `exists(a) and exists(b)` and then called
    # makedirs unconditionally in the else branch — it crashed when exactly
    # one of the two directories already existed. exist_ok covers all cases.
    os.makedirs(video_path, exist_ok=True)
    os.makedirs(music_path, exist_ok=True)
    return video_path, music_path
def combine_video_music(name, input_path, video_path, music_path):
    """Mux the downloaded video and audio tracks into one mp4 with ffmpeg.

    Args:
        name: base file name (no extension) of the tracks and the output.
        input_path: directory receiving the merged <name>.mp4.
        video_path / music_path: directories holding the separate tracks.
    """
    print("开始合并数据")
    video_file = os.path.join(video_path, f'{name}.mp4')
    music_file = os.path.join(music_path, f'{name}.mp3')
    out_file = os.path.join(input_path, f'{name}.mp4')
    # Fix: pass an argument list (shell=False) instead of one command string —
    # names containing spaces or shell metacharacters no longer break/inject.
    order = ['ffmpeg', '-i', video_file, '-i', music_file,
             '-c', 'copy', out_file, '-loglevel', 'quiet']
    subprocess.run(order)
    if os.path.exists(out_file):
        print('合并成功啦!!!')
        # Fix: only delete the source tracks after a successful merge; the
        # original removed them even on failure, losing the downloaded data.
        os.remove(video_file)
        os.remove(music_file)
    else:
        print('呜呜呜,对不起~合并失败了,没有找到合并好的文件QAQ')
async def main():
    """Interactive driver: prompt for a video URL and a save folder, then
    scrape the page, download the video and audio streams, and merge them."""
    url = input('请输入视频url(可以是会员视频或电影):')
    path = input('请输入保存路径:')
    # Fix: the original called makefile() twice; one call returns both paths.
    video_path, music_path = makefile(path)
    print('正在爬取中...')
    html = await scrape(url)
    url_dic = get_url(html)
    name = url_dic['name']
    video, music = get_path(video_path, music_path, name)
    # Download the video track: plain URL first, vip URL as fallback.
    print('开始转换视频二进制...')
    if not transform_bytes(url_dic['common_video'], url, video):
        transform_bytes(url_dic['vip_video'], url, video)
    # Download the audio track with the same fallback order.
    print('开始转换音频二进制...')
    if not transform_bytes(url_dic['common_music'], url, music):
        transform_bytes(url_dic['vip_music'], url, music)
    combine_video_music(name, path, video_path, music_path)


if __name__ == '__main__':
    asyncio.run(main())
我电脑是可以运行的,如果有其他错误的话并且确定是无头模式设置的问题的话可以看看这篇博客
https://blog.csdn.net/Blinger/article/details/108369471
注意这两个代码是分开的哦,当然还要准备一些三方库和ffmpeg。
到此就基本实现了下载b站视频的脚本。
接下来成果展示: