b站视频爬取

最新推荐文章于 2024-09-12 10:42:37 发布

点滴瓶

最新推荐文章于 2024-09-12 10:42:37 发布

阅读量358

点赞数 4

分类专栏：爬虫 文章标签：音视频 python 前端 爬虫 网络爬虫

本文链接：https://blog.csdn.net/2301_78056296/article/details/141271357

版权

爬虫专栏收录该内容

1 篇文章 0 订阅

订阅专栏

免责声明

文章所涉及内容，仅供安全研究与教学之用，由于传播、利用本文所提供的信息而造成的任何直接或者间接的后果及损失，均由使用者本人负责，作者不为此承担任何法律及连带责任。

爬取单个视频

需要视频的bvid号

最终代码

需要更换为自己的cookie和header

import time
from tqdm import tqdm
import requests
import json
import re,os
from moviepy.editor import AudioFileClip, VideoFileClip, CompositeVideoClip

# Cookies sent with the video-page request. Left empty here; fill in your own
# values before running.
cookies = {
}

# Request headers used for both the video-page request and the media
# downloads. Left empty here; fill in your own values before running.
headers = {
}

# Query-string parameters sent with the video-page request.
params = {
    'spm_id_from': '333.1007.tianma.1-2-2.click',
    'vd_source': '367145f6e11228ae3f9f416af6f5cd2e',
}
def download_vedio_radio(url, output_path, title):
    """Stream ``url`` into a file below the current working directory.

    The request is sent with the module-level ``headers``.

    Args:
        url: Direct URL of the media stream to download.
        output_path: Path appended verbatim to ``os.getcwd()``; it is expected
            to start with a path separator (e.g. ``'/result/name.mp4'``).
        title: Label shown in the progress-bar description.

    Returns:
        The absolute path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error body is never saved as if it were media.
    """
    save_dir = os.getcwd() + output_path  # absolute path of the target file
    # Keep creating ./result as before, and also make sure the real parent
    # directory of the target exists.
    os.makedirs("result", exist_ok=True)
    os.makedirs(os.path.dirname(save_dir), exist_ok=True)
    block_size = 1024  # bytes read per chunk
    # Using the response as a context manager releases the connection even
    # when writing fails part-way through.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(save_dir, 'wb') as file, tqdm(
                total=total_size, unit='iB', unit_scale=True,
                desc=f"正在下载{title}") as progress_bar:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
    return save_dir
def get_video_url(vb):
    """Fetch a video page and extract its stream URLs and title.

    Requests ``https://www.bilibili.com/video/<vb>`` with the module-level
    ``params``, ``cookies`` and ``headers``, then reads the ``<title>``
    element and the JSON assigned to ``window.__playinfo__`` from the HTML.

    Args:
        vb: Video id string appended to the video page URL.

    Returns:
        A tuple ``(video_url, audio_url, title)``. The URLs come from the
        first entry of the ``video`` and ``audio`` lists under
        ``data.dash``; the title has the ``_哔哩哔哩_bilibili`` suffix removed.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the title or the play info is not present in the page.
        KeyError: If the play info lacks the expected keys.
    """
    response = requests.get(url="https://www.bilibili.com/video/" + vb, params=params, cookies=cookies,
                            headers=headers)
    response.raise_for_status()
    text_data = response.text

    # Match any <title ...> element; do not depend on exact attributes or on
    # whitespace following the closing tag.
    title_match = re.search(r'<title[^>]*>(.*?)</title>', text_data)
    if title_match is None:
        raise ValueError(f"no <title> found in page for {vb}")
    title = title_match.group(1).replace('_哔哩哔哩_bilibili', '')

    playinfo_match = re.search(r'window\.__playinfo__=(.*?)</script>', text_data)
    if playinfo_match is None:
        raise ValueError(f"window.__playinfo__ not found in page for {vb}")
    dash = json.loads(playinfo_match.group(1))['data']['dash']

    def first_stream_url(streams):
        # The original code read 'base_url' for video and 'baseUrl' for audio;
        # accept either spelling for both.
        first = streams[0]
        for key in ('base_url', 'baseUrl'):
            if first.get(key):
                return first[key]
        raise KeyError('base_url')

    video_url = first_stream_url(dash['video'])
    audio_url = first_stream_url(dash['audio'])
    return video_url, audio_url, title
import subprocess
def merge_audio_video(video_file, audio_file):
    """Merge a video file and an audio file into one MP4 using ffmpeg.

    The output is written next to ``video_file`` with ``-`` appended to its
    base name (``clip.mp4`` -> ``clip-.mp4``). Both streams are copied with
    ``-acodec copy -vcodec copy``, and an existing output is overwritten.

    Args:
        video_file: Path of the video-only input file.
        audio_file: Path of the audio-only input file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    start_time = time.time()
    # splitext only strips the final extension, so dots elsewhere in the path
    # (directory names, titles) no longer truncate the output name.
    outfile_name = os.path.splitext(video_file)[0] + '-.mp4'
    # Pass an argument list without a shell so spaces or shell metacharacters
    # in file names (which derive from a web page title) are taken literally.
    cmd = ['ffmpeg', '-y', '-i', video_file, '-i', audio_file,
           '-acodec', 'copy', '-vcodec', 'copy', outfile_name]
    # check=True: report success only when ffmpeg actually succeeded.
    subprocess.run(cmd, check=True)
    end_time = time.time()
    print(f'{outfile_name} 合并成功，耗时{round(end_time - start_time, 1)}秒')

def delete_file(directory, file_name):
    """Remove ``<file_name>.mp4`` and ``<file_name>.mp3`` from ``directory``.

    A message is printed for each of the two candidates, saying whether it
    was deleted or did not exist.

    Args:
        directory: Directory that holds the files.
        file_name: Base name of the files, without extension.
    """
    base = os.path.join(directory, file_name)
    for suffix in (".mp4", ".mp3"):
        target = base + suffix
        if not os.path.exists(target):
            print(f"{file_name}不存在。")
            continue
        os.remove(target)
        print(f"{file_name}已成功删除。")
def get_one_video_url(vb):
    """Download one video, merge its audio and video tracks, and clean up.

    Looks up the stream URLs and title for ``vb``, downloads both streams
    into ``<cwd>/result``, merges them, then deletes the two source files.
    Any exception is caught and reported on stdout rather than propagated.

    Args:
        vb: Video id string passed on to ``get_video_url``.
    """
    try:
        video_url, audio_url, title = get_video_url(vb)
        # Page titles may contain path separators or characters that are not
        # valid in file names; replace them so every path stays in result/.
        safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title).strip(' .') or vb
        # Download the video and audio streams (the progress bar keeps the
        # original title).
        download_vedio_radio(video_url, '/result/' + safe_title + '.mp4', title)
        download_vedio_radio(audio_url, '/result/' + safe_title + '.mp3', title)
        # Merge the two downloaded files.
        result_dir = os.path.join(os.getcwd(), 'result')
        result_file_dir = os.path.join(result_dir, safe_title)
        merge_audio_video(result_file_dir + '.mp4', result_file_dir + '.mp3')
        # Remove the source files now that the merge has finished.
        delete_file(result_dir, safe_title)
    except Exception as e:
        # Best-effort behaviour is kept, but the message now names the video
        # and the exception type (e.g. a bare KeyError prints only the key).
        print(f"{vb} 处理失败: {type(e).__name__}: {e}")

if __name__ == '__main__':
    # Time one complete download-and-merge run for a single video id.
    began = time.time()
    vb = "BV1VG411k77r"
    get_one_video_url(vb)
    elapsed = round(time.time() - began, 1)
    print("耗时：", elapsed, "秒")

步骤讲解

获取视频和音频url链接

需要自定义headers，params和cookies，如果觉得麻烦，有快捷方式

可以使用快捷生成请求代码的网站 Convert curl commands to code：在浏览器开发者工具中找到发送 xhr 数据包的请求，右击选择“复制为 cURL”，再粘贴到该网站，即可生成带有 headers、params 和 cookies 的代码

自行更改

def get_video_url(vb):
    """Fetch a video page and extract its stream URLs and title.

    Requests ``https://www.bilibili.com/video/<vb>`` with the module-level
    ``params``, ``cookies`` and ``headers``, then reads the ``<title>``
    element and the JSON assigned to ``window.__playinfo__`` from the HTML.

    Args:
        vb: Video id string appended to the video page URL.

    Returns:
        A tuple ``(video_url, audio_url, title)``. The URLs come from the
        first entry of the ``video`` and ``audio`` lists under
        ``data.dash``; the title has the ``_哔哩哔哩_bilibili`` suffix removed.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        ValueError: If the title or the play info is not present in the page.
        KeyError: If the play info lacks the expected keys.
    """
    response = requests.get(url="https://www.bilibili.com/video/" + vb, params=params, cookies=cookies,
                            headers=headers)
    response.raise_for_status()
    text_data = response.text

    # Match any <title ...> element; do not depend on exact attributes or on
    # whitespace following the closing tag.
    title_match = re.search(r'<title[^>]*>(.*?)</title>', text_data)
    if title_match is None:
        raise ValueError(f"no <title> found in page for {vb}")
    title = title_match.group(1).replace('_哔哩哔哩_bilibili', '')

    playinfo_match = re.search(r'window\.__playinfo__=(.*?)</script>', text_data)
    if playinfo_match is None:
        raise ValueError(f"window.__playinfo__ not found in page for {vb}")
    dash = json.loads(playinfo_match.group(1))['data']['dash']

    def first_stream_url(streams):
        # The original code read 'base_url' for video and 'baseUrl' for audio;
        # accept either spelling for both.
        first = streams[0]
        for key in ('base_url', 'baseUrl'):
            if first.get(key):
                return first[key]
        raise KeyError('base_url')

    video_url = first_stream_url(dash['video'])
    audio_url = first_stream_url(dash['audio'])
    return video_url, audio_url, title

下载视频和音频

def download_vedio_radio(url, output_path, title):
    """Stream ``url`` into a file below the current working directory.

    The request is sent with the module-level ``headers``.

    Args:
        url: Direct URL of the media stream to download.
        output_path: Path appended verbatim to ``os.getcwd()``; it is expected
            to start with a path separator (e.g. ``'/result/name.mp4'``).
        title: Label shown in the progress-bar description.

    Returns:
        The absolute path of the file that was written.

    Raises:
        requests.HTTPError: If the server answers with an error status, so an
            error body is never saved as if it were media.
    """
    save_dir = os.getcwd() + output_path  # absolute path of the target file
    # Keep creating ./result as before, and also make sure the real parent
    # directory of the target exists.
    os.makedirs("result", exist_ok=True)
    os.makedirs(os.path.dirname(save_dir), exist_ok=True)
    block_size = 1024  # bytes read per chunk
    # Using the response as a context manager releases the connection even
    # when writing fails part-way through.
    with requests.get(url, headers=headers, stream=True) as response:
        response.raise_for_status()
        total_size = int(response.headers.get('content-length', 0))
        with open(save_dir, 'wb') as file, tqdm(
                total=total_size, unit='iB', unit_scale=True,
                desc=f"正在下载{title}") as progress_bar:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
    return save_dir

合并视频和音频

这里使用了ffmpeg合并视频和音频，速度非常快。需要先去ffmpeg官网下载，并将其bin目录添加到环境变量PATH中

def merge_audio_video(video_file, audio_file):
    """Merge a video file and an audio file into one MP4 using ffmpeg.

    The output is written next to ``video_file`` with ``-`` appended to its
    base name (``clip.mp4`` -> ``clip-.mp4``). Both streams are copied with
    ``-acodec copy -vcodec copy``, and an existing output is overwritten.

    Args:
        video_file: Path of the video-only input file.
        audio_file: Path of the audio-only input file.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    start_time = time.time()
    # splitext only strips the final extension, so dots elsewhere in the path
    # (directory names, titles) no longer truncate the output name.
    outfile_name = os.path.splitext(video_file)[0] + '-.mp4'
    # Pass an argument list without a shell so spaces or shell metacharacters
    # in file names (which derive from a web page title) are taken literally.
    cmd = ['ffmpeg', '-y', '-i', video_file, '-i', audio_file,
           '-acodec', 'copy', '-vcodec', 'copy', outfile_name]
    # check=True: report success only when ffmpeg actually succeeded.
    subprocess.run(cmd, check=True)
    end_time = time.time()
    print(f'{outfile_name} 合并成功，耗时{round(end_time - start_time, 1)}秒')

删除已完成合并的音频和视频

def delete_file(directory, file_name):
    """Remove ``<file_name>.mp4`` and ``<file_name>.mp3`` from ``directory``.

    A message is printed for each of the two candidates, saying whether it
    was deleted or did not exist.

    Args:
        directory: Directory that holds the files.
        file_name: Base name of the files, without extension.
    """
    base = os.path.join(directory, file_name)
    for suffix in (".mp4", ".mp3"):
        target = base + suffix
        if not os.path.exists(target):
            print(f"{file_name}不存在。")
            continue
        os.remove(target)
        print(f"{file_name}已成功删除。")

本人新学的python，希望各位大佬多多指教，不喜勿喷