采集的数据
- 采集高清视频
- 视频标题
- 播放量,点赞数,收藏数
- 视频标签
实现步骤
- 获取cookie
- 请求视频Url, 获取视频html源码
- 解析html, 获取标题,播放量,点赞数,收藏数,视频标签等信息
- 解析html, 获取最高清视频及音频url
- 保存视频文件,音频文件
- 合并视频和音频文件
采集入口
使用视频播放页面作为采集入口,使用视频播放页面的url作为参数进行采集。
获取Cookie
采集前先在浏览器中登录b站,并从请求头中获取登录后的cookie。未登录时,请求返回的数据不完整:其中不包含高清视频地址,只有较低清晰度的地址。
复制出请求头中的Cookie的值,粘贴到代码中。
# Request headers for the video page. Paste the Cookie value copied from a
# logged-in browser session into "Cookie"; per the text above, without it the
# page only exposes lower-resolution stream URLs.
headers = {
    "referer": "https://www.bilibili.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "Cookie": "",
}
# A timeout keeps the script from hanging forever on a stalled connection.
response = requests.get(url=url, headers=headers, timeout=10)
print("返回200,则网页请求成功:", response)
请求视频Url, 获取视频html源码
b站视频的下载地址可以直接从视频页面的HTML中获取:用下面的正则可以提取出完整的json对象,其中存放着不同清晰度的视频地址。
# Pull the JSON text assigned to window.__playinfo__ out of the page source.
playinfo_matches = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
if not playinfo_matches:
    # Fail with a clear message instead of a bare IndexError.
    raise ValueError("window.__playinfo__ not found in the page source")
html_data = playinfo_matches[0]
json_data = json.loads(html_data)
# Pretty-printed copy of the payload, handy for inspecting its structure.
json_dicts = json.dumps(json_data, indent=4)

# Walk the video streams and remember the one with the largest bandwidth.
max_bandwidth_video = ''
max_bandwidth = 0
for stream in json_data["data"]["dash"]["video"]:
    if stream["bandwidth"] > max_bandwidth:
        max_bandwidth = stream["bandwidth"]
        max_bandwidth_video = stream["baseUrl"]
print("最大码率视频地址为:", max_bandwidth_video)
print("最大码率为:", max_bandwidth)
解析html, 获取标题,播放量,点赞数,收藏数,视频标签等信息
视频各种信息主要通过正则表达式或者xpath获取,这个根据个人喜好来
# Video title, taken from the title attribute of the page's <h1>.
title = re.findall('<h1 title="(.*?)"', response.text)[0]
# The title is later used as a file name, so drop the characters that are not
# allowed in file names: \ / : * ? " < > |
# (the backslash has to be written as \\ inside the character class).
title = re.sub(r'[\\/:*?"<>|]', "", title)
print("视频标题为:", title)

# Parse the HTML once and reuse the tree for every XPath query below.
page = lxml.etree.HTML(response.text)

# Video description
desc = page.xpath('/html/body/div[2]/div[2]/div[1]/div[4]/div[1]/div[1]/span/text()')
print("视频描述为:", desc)

# Tags
tags = page.xpath('/html/body/div[2]/div[2]/div[1]/div[4]/div[2]/div/div/div/a/text()')
print("视频标签为:", tags)

# Like count
like = page.xpath('/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[1]/div/span/text()')
print("点赞数量为:", like)

# Coin count
coin = page.xpath('/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[2]/div/span/text()')
print("投币数量为:", coin)

# Favorite (collect) count
collect = page.xpath('/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[3]/div/span/text()')
print("收藏数量为:", collect)

# Play count
play = page.xpath('/html/body/div[2]/div[2]/div[1]/div[1]/div/div/span[1]/text()')
print("播放数量为:", play)
解析html, 获取视频及音频url
json对象data.dash.video下存储着不同清晰度的视频链接,我们这里直接获取带宽最大的,即为最高清的。
b站中的视频文件和音频文件是分开的,要都采集下来再合并。
# Pull the JSON text assigned to window.__playinfo__ out of the page source.
playinfo_matches = re.findall('<script>window.__playinfo__=(.*?)</script>', response.text)
if not playinfo_matches:
    # Fail with a clear message instead of a bare IndexError.
    raise ValueError("window.__playinfo__ not found in the page source")
html_data = playinfo_matches[0]
json_data = json.loads(html_data)
# Pretty-printed copy of the payload, handy for inspecting its structure.
json_dicts = json.dumps(json_data, indent=4)

# Walk the video streams and remember the one with the largest bandwidth.
max_bandwidth_video = ''
max_bandwidth = 0
for stream in json_data["data"]["dash"]["video"]:
    if stream["bandwidth"] > max_bandwidth:
        max_bandwidth = stream["bandwidth"]
        max_bandwidth_video = stream["baseUrl"]
print("最大码率视频地址为:", max_bandwidth_video)
print("最大码率为:", max_bandwidth)

# Audio is listed separately from the video streams; the first entry is used.
audio_url = json_data["data"]["dash"]["audio"][0]["baseUrl"]
print("音频地址为:", audio_url)
保存视频文件,音频文件
import os

# open() does not create missing directories, so make sure ./temp exists.
os.makedirs("./temp", exist_ok=True)

# The video and audio tracks are separate downloads. Each one is streamed to
# disk in chunks so a large file is never held in memory in full.
for media_url, suffix in ((max_bandwidth_video, ".mp4"), (audio_url, ".mp3")):
    with requests.get(url=media_url, headers=video_header, stream=True, timeout=30) as media_response:
        # Do not save an HTTP error page as if it were media data.
        media_response.raise_for_status()
        with open(f'./temp/{title}' + suffix, mode="wb") as f:
            for chunk in media_response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)
print("数据写入成功!")
合并视频和音频文件
采用ffmpeg工具来将视频文件和音频文件进行合并,python中可以通过subprocess模块来调用ffmpeg,通过命令行传参的方式来指定视频路径和音频路径
ffmpeg下载及使用教程
https://blog.csdn.net/HYEHYEHYE/article/details/122000352
# Merge the video and audio tracks with ffmpeg.
# The title comes from the web page, so the command is passed as an argument
# list (no shell): spaces, quotes or shell metacharacters in the title can
# neither break nor alter the command. The "./" prefix keeps a title that starts
# with "-" from being read as an ffmpeg option. check=True raises if ffmpeg
# exits with a non-zero status instead of failing silently.
subprocess.run(
    [
        "ffmpeg",
        "-i", "./temp/" + title + ".mp4",
        "-i", "./temp/" + title + ".mp3",
        "-c", "copy",
        "./" + title + "_new.mp4",
    ],
    check=True,
)
完整代码
import requests
import re
import json
import subprocess
import os
import time
# xpath解析库
import lxml.etree
# Video page used as the crawl entry point.
url = "https://www.bilibili.com/video/BV1a94y1r7cA/?spm_id_from=333.999.0.0&vd_source=5a4735214a209e1bebb4e5a85701ed93"

USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36"

# Headers for the page request. The Cookie of a logged-in browser session is
# read from the BILIBILI_COOKIE environment variable instead of being
# hard-coded: it carries account credentials (SESSDATA, bili_jct) that must not
# live in source code. Without a cookie the page only exposes lower-resolution
# stream URLs.
headers = {
    "referer": "https://www.bilibili.com",
    "User-Agent": USER_AGENT,
    "Cookie": os.environ.get("BILIBILI_COOKIE", ""),
}

# Headers for the media downloads; the video page itself is sent as referer.
video_header = {
    "referer": url,
    "User-Agent": USER_AGENT,
}

# (printed label, absolute XPath) for each piece of metadata read from the page:
# description, tags, likes, coins, favorites and play count, in that order.
METADATA_XPATHS = (
    ("视频描述为:", '/html/body/div[2]/div[2]/div[1]/div[4]/div[1]/div[1]/span/text()'),
    ("视频标签为:", '/html/body/div[2]/div[2]/div[1]/div[4]/div[2]/div/div/div/a/text()'),
    ("点赞数量为:", '/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[1]/div/span/text()'),
    ("投币数量为:", '/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[2]/div/span/text()'),
    ("收藏数量为:", '/html/body/div[2]/div[2]/div[1]/div[3]/div[1]/div[1]/div[3]/div/span/text()'),
    ("播放数量为:", '/html/body/div[2]/div[2]/div[1]/div[1]/div/div/span[1]/text()'),
)


def sanitize_title(raw_title: str) -> str:
    """Return *raw_title* without the characters that are invalid in file names."""
    # Removed characters: \ / : * ? " < > |
    # The backslash must be written as \\ inside the character class.
    return re.sub(r'[\\/:*?"<>|]', "", raw_title)


def extract_title(html: str) -> str:
    """Return the sanitized title attribute of the first <h1> in *html*.

    Raises:
        ValueError: if no '<h1 title="...">' is present.
    """
    found = re.findall('<h1 title="(.*?)"', html)
    if not found:
        raise ValueError("video title not found in the page source")
    return sanitize_title(found[0])


def extract_playinfo(html: str) -> str:
    """Return the raw JSON text assigned to window.__playinfo__ in *html*.

    Raises:
        ValueError: if the page contains no such script tag.
    """
    found = re.findall('<script>window.__playinfo__=(.*?)</script>', html)
    if not found:
        raise ValueError("window.__playinfo__ not found in the page source")
    return found[0]


def pick_best_video(video_streams):
    """Return ``(baseUrl, bandwidth)`` of the highest-bandwidth stream.

    When several streams share the highest bandwidth, the first one is chosen.

    Raises:
        ValueError: if *video_streams* is empty.
    """
    if not video_streams:
        raise ValueError("no video streams listed in the play info")
    best = max(video_streams, key=lambda stream: stream["bandwidth"])
    return best["baseUrl"], best["bandwidth"]


def download(media_url: str, destination: str) -> None:
    """Stream *media_url* into the file *destination* in 1 MiB chunks."""
    with requests.get(url=media_url, headers=video_header, stream=True, timeout=30) as media_response:
        # Do not save an HTTP error page as if it were media data.
        media_response.raise_for_status()
        with open(destination, mode="wb") as f:
            for chunk in media_response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)


def main() -> None:
    """Fetch the video page, print its metadata, download and merge the media."""
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    html = response.text

    # Keep a timestamped snapshot of the page source for later inspection.
    timestamp = time.strftime("%Y%m%d%H%M%S", time.localtime())
    snapshot_dir = f"./bilibili/temp/{timestamp}"
    os.makedirs(snapshot_dir, exist_ok=True)
    with open(f"{snapshot_dir}/index.html", "w", encoding="utf-8") as f:
        f.write(html)

    title = extract_title(html)
    print("视频标题为:", title)

    # Parse the HTML once and reuse the tree for every XPath query.
    page = lxml.etree.HTML(html)
    for label, xpath in METADATA_XPATHS:
        print(label, page.xpath(xpath))

    # Save the raw play info next to the page snapshot, then decode it.
    html_data = extract_playinfo(html)
    with open(f"{snapshot_dir}/data.json", "w", encoding="utf-8") as f:
        f.write(html_data)
    dash = json.loads(html_data)["data"]["dash"]

    max_bandwidth_video, max_bandwidth = pick_best_video(dash["video"])
    print("最大码率视频地址为:", max_bandwidth_video)
    print("最大码率为:", max_bandwidth)

    # open() does not create missing directories, so make sure ./temp exists.
    os.makedirs("./temp", exist_ok=True)
    video_path = f"./temp/{title}.mp4"
    audio_path = f"./temp/{title}.mp3"
    download(max_bandwidth_video, video_path)

    # Audio is listed separately from the video streams; the first entry is used.
    audio_url = dash["audio"][0]["baseUrl"]
    print("音频地址为:", audio_url)
    download(audio_url, audio_path)
    print("数据写入成功!")

    # Merge the two tracks with ffmpeg. The title comes from the web page, so
    # the command is passed as an argument list (no shell); the "./" prefix
    # keeps a title starting with "-" from being read as an ffmpeg option.
    subprocess.run(
        ["ffmpeg", "-i", video_path, "-i", audio_path, "-c", "copy", f"./{title}_new.mp4"],
        check=True,
    )


if __name__ == "__main__":
    main()