python 网页爬虫，多任务下载视频

小虎周

已于 2022-09-12 11:18:59 修改

阅读量2.8k

点赞数 2

文章标签： python 开发语言 前端 爬虫

于 2022-09-11 15:28:27 首次发布

本文链接：https://blog.csdn.net/zhou8622/article/details/126805695

版权

在网上找到一个视频网站，视频却要手动一个一个地下载，太麻烦了，怎么办？用某雷？已经 out 了。
网页爬虫多线程下载视频步骤：

引入requests 访问网页内容，用正则解析提取url
分析html页得到mp4地址
把url存入线程安全的queue
多线程获取queue内的mp4地址，同时下载

import requests
import re
import os
import queue
import threading
import shutil

def download_start(start_page=10, max_pages=None):
    """Crawl listing pages and download every MP4 found, using worker threads.

    The calling thread walks the listing pages of ``download_site_home``,
    opens every linked play page, extracts its ``data-code`` values, asks
    ``mp4_api_url`` for the MP4 address behind each code and puts that
    address on a bounded queue.  Worker threads take addresses off the queue
    and stream the files into ``store_location``.

    Args:
        start_page: Number of the first listing page to crawl.
        max_pages: How many listing pages to crawl before the workers are
            shut down and the function returns.  ``None`` (the default)
            keeps crawling until the process is interrupted.

    Raises:
        OSError: If ``store_location`` cannot be created.
    """
    worker_count = 5
    request_timeout = 30    # seconds; without it a stalled connection blocks a thread forever
    store_location = '/Users/Downloads/.dyxx/'    # directory the videos are saved in
    download_site_home = "https://xxxxxx.com/"   # site to crawl (placeholder: supply a real site yourself)
    mp4_api_url = 'https://api.xxxxxx.com/get-mp4-url?code='  # API that maps a play-page code to the MP4 address

    # Compiled once instead of on every loop iteration.
    re_href = re.compile(r'href="/\d{4}/[^"]*')
    re_play_code = re.compile(r'data-code="[^"]*')

    # Bounded queue: put() blocks while it is full, so the crawler cannot run
    # far ahead of the downloads.
    download_url_queue = queue.Queue(3)
    mp4_code_set = set()

    os.makedirs(store_location, exist_ok=True)

    def local_path(mp4_url):
        """Return the file path an MP4 address is stored under.

        The name is the last 15 characters of the address; anything up to a
        '/' inside that tail is dropped so the result is always a plain file
        name inside ``store_location``.
        """
        return store_location + mp4_url[-15:].rsplit('/', 1)[-1]

    def fetch(mp4_url):
        """Download one MP4 unless it is already present on disk."""
        file_path = local_path(mp4_url)
        if os.path.exists(file_path):
            return
        print('Download start::::' + mp4_url)
        res_header = requests.head(mp4_url, timeout=request_timeout)
        if res_header.headers.get('Content-Type') != 'video/mp4':
            print('Download skipped (not video/mp4)::::' + mp4_url)
            return
        # Stream into a temporary name and rename only on success, so an
        # interrupted transfer never looks like a finished download.
        part_path = file_path + '.part'
        try:
            with requests.get(mp4_url, stream=True, timeout=request_timeout) as res:
                res.raise_for_status()
                with open(part_path, "wb") as f:
                    shutil.copyfileobj(res.raw, f)
            os.replace(part_path, file_path)
        finally:
            # Only still present when the transfer or the rename failed.
            if os.path.exists(part_path):
                os.remove(part_path)
        print('Download end::::' + mp4_url)

    def download():
        """Worker loop: download queued addresses until a ``None`` marker arrives."""
        while True:
            mp4_url = download_url_queue.get()    # blocks; no busy-waiting
            if mp4_url is None:
                return
            try:
                fetch(mp4_url)
            except Exception as ee:
                # Thread boundary: report the failure and keep the worker alive.
                print(str(ee))

    def enqueue_play_page(play_page):
        """Queue the MP4 address of every not-yet-seen code on one play page."""
        play_page_res = requests.get(play_page, timeout=request_timeout)
        if play_page_res.status_code != 200:
            return
        for code in set(re_play_code.findall(play_page_res.text)):
            param_code = code.replace('data-code="', '')
            if param_code in mp4_code_set:
                # Skip only this code; the others on the page may be new.
                continue
            mp4res = requests.get(mp4_api_url + param_code, timeout=request_timeout)
            # Marked as seen only once the API answered, so a code whose
            # lookup raised can be retried when it shows up again.
            mp4_code_set.add(param_code)
            if mp4res.status_code != 200:
                continue
            mp4_url = mp4res.text
            if not mp4_url or os.path.exists(local_path(mp4_url)):
                continue
            print(mp4_url + '   ' + param_code)
            download_url_queue.put(mp4_url)

    def crawl_listing(page_number):
        """Visit every play page linked from one listing page."""
        download_pages = download_site_home + '?page=' + str(page_number)
        res = requests.get(download_pages, timeout=request_timeout)
        if res.status_code != 200:
            return
        all_href = re_href.findall(res.text)
        # The first and last 15 matches are dropped; presumably these are
        # navigation links of this particular site - TODO confirm.
        for href_item in set(all_href[15:-15]):
            play_page = download_site_home + href_item.replace('href="/', '')
            try:
                enqueue_play_page(play_page)
            except Exception as e:
                # Best effort: one broken play page must not stop the crawl.
                print(play_page + ' failed: ' + str(e))

    # Daemon threads, so an interrupt in the calling thread ends the process
    # instead of leaving it hanging on the workers.
    workers = [threading.Thread(target=download, daemon=True)
               for _ in range(worker_count)]
    for worker in workers:
        worker.start()

    page = start_page
    pages_crawled = 0
    while max_pages is None or pages_crawled < max_pages:
        try:
            crawl_listing(page)
        except Exception as e:
            # Report and move on; retrying the same page immediately would
            # loop forever on a permanent failure.
            print('page ' + str(page) + ' failed: ' + str(e))
        page += 1
        pages_crawled += 1

    # The queue is FIFO, so the markers are taken only after every queued
    # address has been handed to a worker; each worker consumes one and exits.
    for _ in workers:
        download_url_queue.put(None)
    for worker in workers:
        worker.join()


# Script entry point: start the crawl only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    download_start()