此次练习的目的
-
将B站的视频批量下载下来,保存为flv格式
-
.m4s格式的URL拼接有困难,只能一个一个进行下载,保存下来的格式为mp4
-
.m4s格式的链接分为两个:一个是视频链接,一个是音频链接,需要分别下载后再进行合并
-
访问.m4s格式的链接时,在requests的请求头(headers)中设置好Range参数的范围,即可下载完整视频
-
-
通过解析网页,找到avid和cid两个参数,再通过api返回的json数据解析出flv格式的视频链接
程序源码
import requests
import pprint
import re
import jsonpath
import os
from tqdm import tqdm
from lxml import etree
import time
def find_url(fir_url):
    """Collect video page links and titles from a search results page.

    Fetches ``fir_url``, selects every ``<li class="video-item matrix">``
    element, and appends the first anchor ``href`` and ``title`` found inside
    it to the module-level ``urls`` and ``titles`` lists. The two lists stay
    index-aligned: an item is recorded only when both values are present.

    Args:
        fir_url: Fully formatted search URL to request.

    Returns:
        None. Results are appended to the global ``urls`` and ``titles``.
    """
    # Send the same module-level headers that find_flv uses for page requests.
    res = requests.get(fir_url, headers=headers)
    tree = etree.HTML(res.text)
    for info in tree.xpath('//li[@class="video-item matrix"]'):
        url = info.xpath('.//a/@href')
        title = info.xpath('.//a/@title')
        # Skip items without a link or title instead of raising IndexError,
        # so one malformed entry does not abort the whole search.
        if not url or not title:
            continue
        urls.append(url[0])
        titles.append(title[0])
def find_flv(url):
    """Resolve the download links for a single video page.

    Requests the video page, extracts the ``cid`` and ``aid`` values from the
    first ``cid=...&aid=...&attribute=`` occurrence in the HTML, then queries
    the playurl API and pulls every ``backup_url`` entry out of the JSON.

    Args:
        url: Absolute URL of the video page.

    Returns:
        Whatever ``jsonpath.jsonpath(ret, '$..backup_url')`` yields: a list of
        the matched ``backup_url`` values.
        NOTE(review): the jsonpath package is expected to return ``False``
        when nothing matches -- confirm against the installed version; callers
        should check the result before indexing.

    Raises:
        IndexError: If the page does not contain the cid/aid pattern. The
            exception type matches what the previous ``id[0]`` lookup raised.
    """
    res = requests.get(url, headers=headers)
    # The first match is the same pair that findall(...)[0] used to provide.
    match = re.search('cid=(.*?)&aid=(.*?)&attribute=', res.text)
    if match is None:
        raise IndexError('cid/aid not found in page: {}'.format(url))
    cid, avid = match.groups()
    url_str = 'https://api.bilibili.com/x/player/playurl?avid={}&cid={}&qn=80&type=&otype=json'
    # Use the same headers as the page request so both calls look alike.
    ret = requests.get(url_str.format(avid, cid), headers=headers).json()
    return jsonpath.jsonpath(ret, '$..backup_url')
# def DownloadFile(url, headers,save_url, file_name):
# try:
# if url is None or save_url is None or file_name is None or headers is None:
# print('参数错误')
# return None
# # 文件夹不存在,则创建文件夹
# folder = os.path.exists(save_url)
# if not folder:
# os.makedirs(save_url)
# # 读取资源
# res = requests.get(url, headers=headers,stream=True)
# total_size = int(int(res.headers["Content-Length"]) / 1024 + 0.5)
# # 获取文件地址
# file_path = os.path.join(save_url, file_name)
#
# # 打开本地文件夹路径file_path,以二进制流方式写入,保存到本地
#
# with open(file_path, 'wb') as fd:
# print('开始下载文件:{},当前文件大小:{}KB'.format(file_name, total_size))
# for chunk in tqdm(iterable=res.iter_content(1024), total=total_size, unit='k', desc=None):
# fd.write(chunk)
# print(file_name + ' 下载完成!')
# except:
# print("程序错误")
def progressbar(url, headers, path, file_name):
    """Download ``url`` into ``path``/``file_name`` while printing progress.

    The target directory is created if needed (including parent directories).
    The response is streamed in 1 KiB chunks; after each chunk a progress line
    is rewritten in place. Network and file errors are reported on stdout and
    the function returns without raising, so a batch caller can continue.

    Args:
        url: Direct URL of the file to download.
        headers: Request headers passed to ``requests.get``.
        path: Directory in which to store the file.
        file_name: Name of the file to write, including its extension.

    Returns:
        None.
    """
    # makedirs also handles nested paths and an already existing directory.
    os.makedirs(path, exist_ok=True)
    # join() works whether or not ``path`` ends with a separator.
    filepath = os.path.join(path, file_name)
    chunk_size = 1024  # bytes read per iteration
    start = time.time()  # download start time
    try:
        # The context manager releases the streamed connection on every path.
        with requests.get(url, headers=headers, stream=True) as response:
            if response.status_code != 200:
                print('Error! HTTP status {} for {}'.format(response.status_code, file_name))
                return
            # The header may be absent; fall back to 0 and skip the
            # percentage display rather than failing or dividing by zero.
            content_size = int(response.headers.get('content-length', 0))
            print('{} Start download,[File size]:{size:.2f} MB'.format(
                file_name, size=content_size / chunk_size / 1024))
            size = 0  # bytes written so far
            with open(filepath, 'wb') as file:
                for data in response.iter_content(chunk_size=chunk_size):
                    file.write(data)
                    size += len(data)
                    if content_size > 0:
                        print('\r' + '[下载进度]:%s%.2f%%' % (
                            '>' * int(size * 50 / content_size),
                            float(size / content_size * 100)), end=' ')
                    else:
                        print('\r' + '[下载进度]:%.2f MB' % (size / chunk_size / 1024), end=' ')
    except (requests.RequestException, OSError) as err:
        # Report the cause instead of hiding it behind a bare ``except``;
        # KeyboardInterrupt and programming errors now propagate.
        print('Error!', err)
        return
    end = time.time()  # download end time
    print('\nDownload completed!,times: %.2f秒\n' % (end - start))
# Request headers shared by the page, API and download requests.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.102 Safari/537.36'
}
# Search URL template; the keyword typed by the user fills the placeholder.
fir_url = 'https://search.bilibili.com/all?keyword={}&from_source=nav_suggest_new'
word = input("请输入想要搜索的内容:")
# Filled in by find_url(); the two lists are index-aligned.
urls = []
titles = []
find_url(fir_url.format(word))
for aurl, title in zip(urls, titles):
    try:
        # The collected hrefs have no scheme, so one is prepended here.
        flv_url = find_flv('http:' + aurl)
    except (IndexError, ValueError, requests.RequestException) as err:
        # One unresolvable video should not abort the rest of the batch.
        print('Skip {}: {}'.format(title, err))
        continue
    # find_flv may return a falsy value or an empty first entry when the API
    # response contains no usable backup_url.
    if not flv_url or not flv_url[0]:
        print('Skip {}: no download link found'.format(title))
        continue
    # '/' would be interpreted as a directory separator in the file name.
    safe_title = title.replace('/', '')
    progressbar(flv_url[0][0], headers, './manjian/', '{}.flv'.format(safe_title))
爬取效果
可以改进的方向(不是此次练习的主要目的,没有进行实现)
- 可以将搜索到的内容打印出来,供用户选择要下载的视频
- 保存的路径可以通过键盘输入进行选择
- 取消下载时,删除当前尚未下载完成的文件