import json
import os
import subprocess
import requests
import re
def get_response(url, timeout=30):
    """Send a GET request to *url* with browser-like headers.

    Args:
        url: Address to fetch.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without a
            timeout a stalled connection would block the script forever.

    Returns:
        The ``requests.Response`` returned by ``requests.get``. The HTTP
        status is not checked here.
    """
    headers = {
        'referer': 'http://www.bilibili.com/',
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:98.0) Gecko/20100101 Firefox/98.0',
    }
    return requests.get(url=url, headers=headers, timeout=timeout)
def get_video_info(url):
    """Fetch a video page and pull out its title and first DASH stream URLs.

    Args:
        url: Address of the video page to download via ``get_response``.

    Returns:
        ``[video_title, video_url, audio_url]`` where the two URLs are the
        ``baseUrl`` of the first entry under ``data.dash.video`` and
        ``data.dash.audio`` in the page's inline ``window.__playinfo__`` JSON.

    Raises:
        IndexError: The page has no ``<h1 title="...">`` tag or no inline
            ``window.__playinfo__`` script. The exception type is the one the
            unguarded ``[0]`` lookups raised before; it now carries a message
            naming what was missing.
        json.JSONDecodeError: The play-info payload is not valid JSON.
        KeyError: The play-info JSON lacks the expected ``data``/``dash`` keys.
    """
    page_text = get_response(url).text

    title_match = re.search(r'<h1 title="(.*?)"', page_text)
    if title_match is None:
        raise IndexError(f'no <h1 title="..."> tag found in page: {url}')

    playinfo_match = re.search(
        r'<script>window.__playinfo__=(.*?)</script>', page_text
    )
    if playinfo_match is None:
        raise IndexError(f'no window.__playinfo__ script found in page: {url}')

    dash = json.loads(playinfo_match.group(1))['data']['dash']
    audio_url = dash['audio'][0]['baseUrl']
    video_url = dash['video'][0]['baseUrl']
    return [title_match.group(1), video_url, audio_url]
def mergeVideoAudio(video_name, num, no):
    """Mux ``output/{num}.mp4`` and ``output/{num}.mp3`` into one MP4 via ffmpeg.

    The merged file is written to ``output/{no}_{stem}.mp4``, where ``stem``
    is ``num`` with its last space-separated word dropped. Both streams are
    copied without re-encoding (``-acodec copy -vcodec copy``).

    The two input files are deleted only after ffmpeg exits with status 0.
    If ffmpeg cannot be started or reports a failure, a message is printed
    and the inputs are kept so the downloads are not lost.

    Args:
        video_name: Unused; kept so existing callers keep working.
        num: Base name (no extension) of the downloaded pair in ``output/``.
        no: Value prefixed to the merged file's name.

    Returns:
        None.
    """
    output_name = f'{no}_' + ' '.join(num.split(' ')[:-1])
    video_path = f'output/{num}.mp4'
    audio_path = f'output/{num}.mp3'
    merged_path = f'output/{output_name}.mp4'

    # An argument list (no shell) keeps titles containing quotes, `$`,
    # backticks or `;` from being interpreted as shell syntax.
    cmd = [
        'ffmpeg',
        '-i', video_path,
        '-i', audio_path,
        '-acodec', 'copy',
        '-vcodec', 'copy',
        merged_path,
    ]
    try:
        return_code = subprocess.call(cmd)
    except OSError as err:
        # Raised e.g. when the ffmpeg executable is not on PATH.
        print(f'ffmpeg could not be started, keeping downloads: {err}')
        return
    if return_code != 0:
        print(f'ffmpeg exited with status {return_code}, keeping downloads for {num}')
        return

    os.remove(audio_path)
    os.remove(video_path)
def save(info_list, num, no):
    """Download a video's audio and video streams into ``./output`` and merge them.

    The audio stream is saved as ``./output/{num}.mp3`` and the video stream
    as ``./output/{num}.mp4``; ``mergeVideoAudio`` is then called to combine
    them. Each stream is written to disk before the next one is downloaded,
    so only one payload is held in memory at a time.

    Args:
        info_list: Sequence laid out as ``[title, video_url, audio_url]``;
            only the two URLs are used here.
        num: Base file name for the downloads. It is converted with ``str``
            and every ``/`` is replaced by ``_`` so it cannot introduce a
            sub-directory.
        no: Passed through to ``mergeVideoAudio`` as the output-name prefix.
    """
    # Coerce once up front; previously `.replace` ran before the `str(...)`
    # conversions and failed for non-string values.
    num = str(num).replace('/', '_')

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('./output', exist_ok=True)

    streams = (('.mp3', info_list[2]), ('.mp4', info_list[1]))
    for extension, stream_url in streams:
        payload = get_response(stream_url).content
        with open('./output/' + num + extension, 'wb') as f:
            f.write(payload)

    print(f'video_{num}爬取成功!')
    mergeVideoAudio('./output/' + num, num, no)
def get_list(url):
    """Fetch a listing endpoint and return its video ids and titles.

    The response body is parsed as JSON and the entries under
    ``data.archives`` are read in order.

    Args:
        url: Address of the listing endpoint, fetched via ``get_response``.

    Returns:
        A ``(bvid_list, title_list)`` tuple of two parallel lists holding each
        entry's ``bvid`` and ``title``.
    """
    payload = json.loads(get_response(url).text)
    archives = payload['data']['archives']
    # Read both keys of an entry in one pass so lookups happen per entry.
    pairs = [(entry['bvid'], entry['title']) for entry in archives]
    bvids = [bvid for bvid, _ in pairs]
    titles = [title for _, title in pairs]
    return bvids, titles
if __name__ == '__main__':
    # Listing endpoint queried for the ids and titles (page 1, page size 30).
    listing_url = (
        'https://api.bilibili.com/x/polymer/space/seasons_archives_list'
        '?mid=202224425&season_id=6895&sort_reverse=false&page_num=1&page_size=30'
    )
    bvid_list, title_list = get_list(listing_url)

    # Only index 6 of the listing is processed; widen the range to fetch more.
    for n in range(6, 7):
        bvid, clip_title = bvid_list[n], title_list[n]
        page_url = (
            f'https://www.bilibili.com/video/{bvid}/'
            '?spm_id_from=333.788&vd_source=270c4500eb11ca999c55b6d79d7ae5f8'
        )
        save(info_list=get_video_info(page_url), num=clip_title, no=n)
        print(f'{n} is finished')

    # Loose BV ids left by the original author (purpose not recorded):
    # BV1fY4y137yh
    # BV1R34y1L7sY
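# Python crawler for a certain video site (page title carried over from the blog post).
# Latest recommended article published on 2024-08-09 00:05:38 (blog page footer).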