应用版本:PyCharm 2023.2.4、Python 3.11.3
结果展示:
涉及依赖:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from multiprocessing import Pool, freeze_support, RLock
代码:
import requests
from bs4 import BeautifulSoup
import json
from tqdm import tqdm
from multiprocessing import Pool, freeze_support, RLock
# HTTP request headers sent with every page fetch; fill in the User-Agent
# string of your own browser.
headers = {"User-Agent": ""}

# Absolute directory where downloaded videos are saved. The trailing
# separator matters because file names are appended by plain concatenation.
file_path = "C:\\Users\\Desktop\\video\\"
def download_videos_thread(videoWebUrls, max_workers=None):
    """Download every video page in ``videoWebUrls`` using a process pool.

    Despite the name, the work is spread over worker *processes*
    (``multiprocessing.Pool``), not threads.

    Args:
        videoWebUrls: Sized collection of video page URLs, each of which is
            handed to ``download_video``.
        max_workers: Optional upper bound on the number of worker processes.
            When ``None`` (the default) one process per URL is started.

    Returns:
        None. Returns immediately when ``videoWebUrls`` is empty.
    """
    freeze_support()

    # Pool(0) raises ValueError, so there is nothing to do for an empty list.
    if not videoWebUrls:
        return

    worker_count = len(videoWebUrls)
    if max_workers is not None:
        worker_count = max(1, min(worker_count, max_workers))

    # Every worker installs the same lock so that concurrent tqdm progress
    # bars do not garble each other's output. The context manager makes sure
    # the worker processes are cleaned up even if a download raises.
    with Pool(worker_count, initializer=tqdm.set_lock, initargs=(RLock(),)) as pool:
        pool.map(download_video, videoWebUrls)
def download_video(videoWebUrl):
    """Download the video embedded in one video page into ``file_path``.

    The page's ``<title>`` text names the output file (``<title>.mp4``) and
    the ``src`` attribute of the first ``<video>`` element is the download
    URL. A tqdm progress bar tracks the streamed download.

    Args:
        videoWebUrl: URL of the page that embeds the video.

    Returns:
        None. Pages that cannot be fetched, or that lack a title or a video
        source, are skipped with a message instead of crashing the worker.

    Raises:
        requests.HTTPError: If the video URL answers with an error status.
    """
    soup = getHTMLText(videoWebUrl, headers)
    # getHTMLText returns "" when the request fails.
    if not soup:
        tqdm.write(f"Skipped {videoWebUrl}: page could not be fetched")
        return

    title_tag = soup.find('title')
    video_tag = soup.find('video')
    videoUrl = video_tag.get('src') if video_tag is not None else None
    if title_tag is None or not videoUrl:
        tqdm.write(f"Skipped {videoWebUrl}: no title or video source found")
        return

    title = title_tag.text.strip()
    target = file_path + _safe_filename(title) + '.mp4'

    # Stream the body so the whole video is never held in memory; the timeout
    # keeps a stalled connection from hanging the worker forever.
    with requests.get(videoUrl, stream=True, timeout=20) as r:
        r.raise_for_status()
        # A missing Content-Length yields 0; pass None so tqdm shows a plain
        # byte counter instead of a bar with a zero total.
        total_size = int(r.headers.get('Content-Length', 0))
        block_size = 1024
        with tqdm(total=total_size or None, unit='B', unit_scale=True,
                  desc=title + '--下载进度:', colour='blue') as progress_bar:
            with open(target, 'wb') as f:
                for chunk in r.iter_content(block_size):
                    if chunk:
                        f.write(chunk)
                        progress_bar.update(len(chunk))


def _safe_filename(name):
    """Return ``name`` with characters that Windows forbids in file names replaced by ``_``."""
    forbidden = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})
    # Fall back to a fixed name so an empty title never produces ".mp4".
    return name.translate(forbidden).strip() or 'video'
def get_videos_url(rootUrl):
    """Collect the video page URLs listed on a category page.

    The page's first ``<script type="application/ld+json">`` block is parsed
    as JSON. Inside its ``@graph`` array the node whose ``@type`` is
    ``ItemList`` holds the entries; each entry's ``@id`` is a video page URL.
    If several ``ItemList`` nodes exist, the last one is used.

    Args:
        rootUrl: URL of the category (listing) page.

    Returns:
        A list of video page URLs. The list is empty when the page cannot be
        fetched or does not contain the expected JSON-LD data.
    """
    soup = getHTMLText(rootUrl, headers)
    # getHTMLText returns "" when the request fails.
    if not soup:
        print(f"Could not fetch {rootUrl}")
        return []

    script = soup.find('script', {'type': 'application/ld+json'})
    if script is None:
        print(f"No JSON-LD script block found on {rootUrl}")
        return []

    try:
        json_loads = json.loads(script.get_text())
    except json.JSONDecodeError:
        print(f"JSON-LD block on {rootUrl} is not valid JSON")
        return []

    graphs = json_loads.get('@graph') if isinstance(json_loads, dict) else None

    # A local list replaces the former module-level global, which was left
    # undefined whenever the page had no ItemList node.
    items = []
    for graph in graphs or []:
        if graph.get('@type') == 'ItemList':
            items = graph.get('itemListElement') or []

    # Entries without an '@id' carry no URL to download from.
    return [item.get('@id') for item in items if item.get('@id')]
def getHTMLText(url, headers):
    """Fetch a web page and return it parsed as a BeautifulSoup document.

    Args:
        url: Address of the page to request.
        headers: HTTP headers to send with the request.

    Returns:
        A ``BeautifulSoup`` object built with the ``html.parser`` backend, or
        the empty string ``""`` when the request fails (connection error,
        timeout, invalid URL, ...).
    """
    try:
        r = requests.get(url=url, headers=headers, timeout=20)
        # Let the body content, not the (often missing) HTTP header, decide
        # the text encoding.
        r.encoding = r.apparent_encoding
        html = r.text
    except requests.RequestException:
        # Only network-level failures are treated as "no page"; anything else
        # (KeyboardInterrupt, programming errors) must not be hidden.
        return ""
    return BeautifulSoup(html, "html.parser")
if __name__ == '__main__':
    # Category listing page whose videos should be downloaded.
    rootUrl = 'https://mixkit.co/free-stock-video/nature/'
    # Resolve the URL of every video page linked from the listing.
    videoWebUrls = get_videos_url(rootUrl)
    # Download all of them in parallel worker processes.
    download_videos_thread(videoWebUrls)
仅供参考学习