在网上找到一个视频网站,却只能手动一个一个地下载视频,太麻烦了,怎么办?用某雷?早就 out 了。
用网页爬虫多线程下载视频的步骤:
- 引入 requests 访问网页内容,用正则表达式解析并提取 url
- 分析 html 页面,得到 mp4 地址
- 把 url 存入线程安全的 queue
- 多个线程从 queue 中取出 mp4 地址,同时下载
import requests
import re
import os
import queue
import threading
import shutil
def download_start():
    """Crawl the listing site and download every MP4 it links to.

    Starts five worker threads that take MP4 URLs from a bounded queue and
    save them under ``store_location``. The calling thread then walks the
    listing pages, starting at page 10: each listing page yields detail-page
    links, each detail page yields ``data-code`` values, and each code is
    exchanged at ``mp4_api_url`` for a direct MP4 URL that is queued for
    download.

    This function never returns; the crawl loop and the (non-daemon) worker
    threads run until the process is terminated.
    """
    # Bounded queue: put() blocks the crawler while all workers are busy.
    download_url_queue = queue.Queue(3)
    mp4_code_set = set()  # play codes already resolved during this run
    page = 10
    store_location = '/Users/Downloads/.dyxx/'  # directory the videos are saved to
    download_site_home = "https://xxxxxx.com/"  # site to crawl (placeholder; supply your own)
    # The play code scraped from a detail page is exchanged here for the MP4 URL.
    mp4_api_url = 'https://api.xxxxxx.com/get-mp4-url?code='
    # Without a timeout a stalled connection would hang a thread forever.
    request_timeout = 30  # seconds
    # Compiled once instead of on every loop iteration.
    re_href = re.compile(r'href="/\d{4}/[^"]*')
    re_play_code = re.compile(r'data-code="[^"]*')

    def download():
        """Worker loop: take MP4 URLs off the queue and save them to disk."""
        while True:
            # get() blocks until an item arrives, so idle workers do not spin.
            mp4_url = download_url_queue.get()
            # The last 15 characters of the URL are used as the file name.
            file_path = store_location + mp4_url[-15:]
            # Per-thread temporary name so workers never share a partial file.
            part_path = file_path + '.part' + str(threading.get_ident())
            try:
                if os.path.exists(file_path):
                    continue
                print('Download start::::' + mp4_url)
                res_header = requests.head(mp4_url, timeout=request_timeout)
                # .get() avoids a KeyError when the header is missing.
                if res_header.headers.get('Content-Type') != 'video/mp4':
                    continue
                os.makedirs(store_location, exist_ok=True)
                with requests.get(mp4_url, stream=True, timeout=request_timeout) as res:
                    # Do not save an HTTP error body as if it were a video.
                    res.raise_for_status()
                    with open(part_path, "wb") as f:
                        shutil.copyfileobj(res.raw, f)
                # Publish the final name only after a complete download, so a
                # failed transfer is retried instead of being seen as done.
                os.replace(part_path, file_path)
                print('Download end::::' + mp4_url)
            except Exception as ee:
                # Best effort: report the failure and keep the worker alive.
                print(str(ee))
                try:
                    if os.path.exists(part_path):
                        os.remove(part_path)
                except OSError as cleanup_error:
                    print(str(cleanup_error))

    for _ in range(5):
        threading.Thread(target=download).start()

    while True:
        try:
            download_pages = download_site_home + '?page=' + str(page)
            res = requests.get(download_pages, timeout=request_timeout)
            if res.status_code == 200:
                all_href = re_href.findall(res.text)
                # Drop the first and last 15 matches and de-duplicate the rest.
                # NOTE(review): presumably those are navigation/sidebar links
                # rather than video pages -- confirm against the real site.
                for href_item in set(all_href[15:-15]):
                    play_page = download_site_home + href_item.replace('href="/', '')
                    play_page_res = requests.get(play_page, timeout=request_timeout)
                    if play_page_res.status_code != 200:
                        continue
                    for code in set(re_play_code.findall(play_page_res.text)):
                        param_code = code.replace('data-code="', '')
                        # Skip only this code. Set iteration order is
                        # arbitrary, so leaving the loop here would drop an
                        # unpredictable subset of the remaining codes.
                        if param_code in mp4_code_set:
                            continue
                        mp4_code_set.add(param_code)
                        mp4res = requests.get(mp4_api_url + param_code,
                                              timeout=request_timeout)
                        if mp4res.status_code != 200:
                            continue
                        file_path = store_location + mp4res.text[-15:]
                        if os.path.exists(file_path):
                            continue
                        print(mp4res.text + ' ' + param_code)
                        download_url_queue.put(mp4res.text)
        except Exception as e:
            # Report the failure instead of hiding it.
            print('Crawl error on page ' + str(page) + ': ' + str(e))
        finally:
            # Always advance, so one failing page cannot be retried forever.
            page = page + 1
if __name__ == '__main__':
    # Start the crawler only when this file is run as a script, not on import.
    download_start()