python协程扒电影笔记(协程、m3u8、解密、合并)

南星叨叨

已于 2022-04-01 20:55:29 修改

阅读量1.5k

点赞数 2

分类专栏： # 爬虫文章标签： python

于 2022-01-20 15:26:20 首次发布

本文链接：https://blog.csdn.net/hans99812345/article/details/122601942

版权

爬虫专栏收录该内容

19 篇文章 2 订阅

订阅专栏

这篇文章干货很多

套路就是：通过网站的第一个 m3u8 找到第二个 m3u8。第二个 m3u8 很重要，里面记录了是否加密以及所有的 ts 文件。然后把所有的 ts 文件下载下来；如果加密了，就用第二个 m3u8 提供的 KEY 对 ts 进行解密（每一个 ts 都要解密）。最后把解密后的 ts 合并成一个 mp4。

包的安装

协程相关（asyncio 从 Python 3.4 起就是标准库，不需要用 pip 安装，下面第一条命令可以跳过）

pip install asyncio
pip install aiohttp
pip install aiofiles

解密相关（代码里的 from Crypto.Cipher import AES 由 pycryptodome 提供；crypto 是另一个不相关的包，一般不需要安装）

pip install pycryptodome
pip install crypto

需要注意的是 pycryptodome 这个东西在 Windows 上需要先安装 VC++ 14 的编译工具包。
这个就很坑，建议你安装 Visual Studio 2019（不是 VS Code），然后安装其中的 C++ 生成工具（见下图）。
在这里插入图片描述

#!/usr/bin/python


import asyncio
import json
import os
import re
from urllib import parse

import aiofiles
import aiohttp
import requests
from Crypto.Cipher import AES
from bs4 import BeautifulSoup


def get_page_source(link, timeout=30):
    """Fetch ``link`` with an HTTP GET and return the response body as text.

    The request carries a desktop Chrome ``User-Agent`` header, and the body
    is always decoded as UTF-8 regardless of what the response declares.
    The HTTP status code is not inspected, so the body of an error response
    is returned like any other.

    Args:
        link: URL to request.
        timeout: Value forwarded to ``requests.get(timeout=...)``. Without a
            timeout a stalled server would block the whole script forever.

    Returns:
        The response body decoded as UTF-8.
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36'
    }
    response = requests.get(link, headers=head, timeout=timeout)
    # Force UTF-8 instead of trusting the encoding guessed from the headers.
    response.encoding = 'utf-8'
    return response.text


def parse_page_source(html_content, video_files):
    """Resolve the second-level m3u8 playlist of a video page and save it.

    Steps:
      1. Find the ``<script>`` inside ``<div id="cms_player">`` and parse the
         JSON object assigned in it; its ``url`` entry is the first m3u8.
      2. Download the first m3u8 and take its first whitespace-separated
         entry that does not start with ``#`` as the second m3u8 location
         (resolved relative to the first m3u8 URL).
      3. Download the second m3u8 and write it to ``video_files`` as UTF-8.

    Args:
        html_content: HTML text of the video page.
        video_files: Path of the file that receives the second m3u8 text.

    Raises:
        AttributeError: If the player ``div`` or its ``script`` is missing.
        ValueError: If the script holds no parsable JSON after the first
            ``=`` (``json.JSONDecodeError``), or the first m3u8 lists no
            playlist entry.
    """
    soup = BeautifulSoup(html_content, 'lxml')
    script = soup.find('div', attrs={'id': 'cms_player'}).find('script').text
    # Split on the FIRST '=' only: URLs inside the JSON routinely contain '='
    # (query strings), and splitting on every '=' would truncate the object.
    _, _, assigned = script.partition('=')
    # raw_decode parses one JSON value and ignores whatever follows it (the
    # trailing ';'), so semicolons inside the JSON strings stay intact.
    m3_1_json_data, _ = json.JSONDecoder().raw_decode(assigned.lstrip())
    m3u8_one_url = m3_1_json_data['url']
    m3u8_one_page = get_page_source(m3u8_one_url)
    second_m3u8_url = ""
    for item in m3u8_one_page.split():
        if not item.startswith("#"):
            second_m3u8_url = parse.urljoin(m3u8_one_url, item)
            break
    if not second_m3u8_url:
        # Fail here with a clear message instead of requesting an empty URL.
        raise ValueError(f"no playlist entry found in {m3u8_one_url}")
    second_m3u8 = get_page_source(second_m3u8_url)
    # Write as UTF-8 explicitly: the readers of this file open it as UTF-8.
    with open(video_files, mode="w", encoding="utf-8") as f:
        f.write(second_m3u8)


def get_all_ts(file_path):
    """Return the segment entries listed in a saved m3u8 playlist.

    Every line is stripped of surrounding whitespace. Lines starting with
    ``#`` (playlist tags and comments) and blank lines are skipped; a blank
    line would otherwise be returned as an empty "URL".

    Args:
        file_path: Path of the UTF-8 playlist file.

    Returns:
        The remaining lines, in file order, as a list of strings.
    """
    with open(file_path, 'r', encoding='utf-8') as file_object:
        stripped_lines = (line.strip() for line in file_object)
        return [
            entry for entry in stripped_lines
            if entry and not entry.startswith('#')
        ]

"""
协程下载
这个里面叫爬虫自省
去下载ts,for循环+try+协程 失败了再去重新下载,这个套路还是得好好学学一下
"""
async def aio_download_ts(save_path, ts_url, session, retries=10):
    """Download one segment to ``save_path``, retrying on failure.

    Args:
        save_path: Destination file path for the segment bytes.
        ts_url: URL of the segment.
        session: Object whose ``get(url)`` returns an async context manager
            yielding the response (the caller passes an
            ``aiohttp.ClientSession``).
        retries: Maximum number of attempts before giving up.

    Returns:
        ``""`` when the segment was saved, or ``ts_url`` when every attempt
        failed, so the caller can collect the URLs that still need work.
    """
    for _ in range(retries):
        try:
            async with session.get(ts_url) as resp:
                # Reject 4xx/5xx answers; otherwise an error page would be
                # stored on disk as if it were video data.
                resp.raise_for_status()
                movie_content = await resp.content.read()
            # Store the file
            async with aiofiles.open(save_path, mode="wb") as f:
                await f.write(movie_content)
        except Exception:
            # "except Exception" rather than a bare "except": cancellation
            # and KeyboardInterrupt must propagate instead of being retried.
            print(ts_url, "下载失败!~, 重新下载. ")
        else:
            print(save_path, "下载完毕!~")
            return ""
    return ts_url


async def aio_download(name, movie_file_list):
    """Download every segment URL concurrently into the directory ``./name``.

    The directory is created when missing. Each segment is stored under the
    last ``/``-separated component of its URL.

    Args:
        name: Directory name (relative to the current working directory).
        movie_file_list: Segment URLs to download.

    Returns:
        The URLs whose download failed after all retries (an empty list when
        everything succeeded). Callers may ignore it, retry those URLs after
        a pause, or record them for later.
    """
    file_path = f"./{name}"
    os.makedirs(file_path, exist_ok=True)
    jobs = [
        (os.path.join(file_path, ts_url.split("/")[-1]), ts_url)
        for ts_url in movie_file_list
    ]
    if not jobs:
        # Nothing to download; skip opening an HTTP session at all.
        return []
    async with aiohttp.ClientSession() as session:
        tasks = [
            asyncio.create_task(aio_download_ts(movie_save_path, ts_url, session))
            for movie_save_path, ts_url in jobs
        ]
        # Start the concurrent downloads and collect each worker's result:
        # "" on success, the URL itself on failure.
        results = await asyncio.gather(*tasks)
    return [failed_url for failed_url in results if failed_url]

"""
从第二个m3u8获取key
"""
def get_key(second_m3u8):
    """Fetch the decryption key referenced by a saved m3u8 playlist.

    Reads the playlist file as UTF-8, takes the first ``URI="..."`` value
    found in it, and returns whatever ``get_page_source`` returns for that
    URL. When the playlist has no ``URI="..."`` the lookup on the missing
    match raises ``AttributeError``.

    Args:
        second_m3u8: Path of the playlist file.

    Returns:
        The key as text.
    """
    key_uri_pattern = re.compile(r'URI="(?P<key_url>.*?)"')
    with open(second_m3u8, 'r', encoding='utf-8') as playlist:
        playlist_text = playlist.read()
    match = key_uri_pattern.search(playlist_text)
    return get_page_source(match.group("key_url"))

"""
在服务器做协程解密,估计是性能不行,我就写了个下面的单线程的解密
"""
async def aio_decrypt_ts(file_path, new_file_path, key):
    """Decrypt one segment file with AES-CBC and write the result.

    Both files are opened before any data is read. The key text is encoded
    as UTF-8 and the IV is sixteen ASCII ``0`` characters.

    Args:
        file_path: Path of the encrypted segment.
        new_file_path: Path that receives the decrypted bytes.
        key: AES key as text.
    """
    async with aiofiles.open(file_path, mode="rb") as encrypted_file:
        async with aiofiles.open(new_file_path, mode="wb") as decrypted_file:
            ciphertext = await encrypted_file.read()
            cipher = AES.new(key.encode("utf-8"), IV=b"0000000000000000", mode=AES.MODE_CBC)
            await decrypted_file.write(cipher.decrypt(ciphertext))
    print(f"解密成功, 文件被存放在{new_file_path}")


async def aio_decrypt(name, decrypt_name, ts_url_list, key):
    """Decrypt all segments concurrently from ``./name`` into ``./decrypt_name``.

    The output directory is created when it does not exist. One task per URL
    is scheduled; the segment file name is the last ``/``-separated component
    of the URL.

    Args:
        name: Directory holding the encrypted segments.
        decrypt_name: Directory that receives the decrypted segments.
        ts_url_list: Segment URLs (only their last path component is used).
        key: AES key as text, passed through to ``aio_decrypt_ts``.

    Returns:
        The list produced by ``asyncio.gather`` for the scheduled tasks.
    """
    source_dir = f"./{name}"
    target_dir = f"./{decrypt_name}"
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    segment_names = (ts_url.split("/")[-1] for ts_url in ts_url_list)
    pending = [
        asyncio.create_task(
            aio_decrypt_ts(
                os.path.join(source_dir, segment_name),
                os.path.join(target_dir, segment_name),
                key,
            )
        )
        for segment_name in segment_names
    ]
    return await asyncio.gather(*pending)

"""
解密这个事
网站是aes-128进行加密的
aes = AES.new(key.encode("utf-8"), IV=b"0000000000000000", mode=AES.MODE_CBC)
IV 网站上没有；如果有的话，用网站的。所以我这里就用了16个0。（注意：按 HLS 规范，m3u8 里没有给 IV 时，应该用分片的媒体序号作为 IV，这一点有待确认。）
网上还有个m3u8的包,这个我没研究过
"""
def decrypt(name, decrypt_name, ts_url_list, key, iv=b"0000000000000000"):
    """Decrypt every segment with AES-CBC, one file after another.

    For each URL the file named after its last ``/``-separated component is
    read from ``name`` and the decrypted bytes are written under the same
    file name in ``decrypt_name`` (created when missing).

    Args:
        name: Directory holding the encrypted segments.
        decrypt_name: Directory that receives the decrypted segments.
        ts_url_list: Segment URLs (only their last path component is used).
        key: AES key. Text is encoded as UTF-8; ``bytes``-like values are
            used as they are, so a raw binary key can be passed.
        iv: Initialization vector. The default keeps the previous hard-coded
            value of sixteen ASCII ``0`` characters.
            NOTE(review): the HLS spec appears to require the media sequence
            number as IV when the playlist gives none — confirm and pass the
            right value here if so.
    """
    file_path_dir = f"{name}"
    new_file_path_dir = f"{decrypt_name}"
    os.makedirs(new_file_path_dir, exist_ok=True)
    key_bytes = key.encode("utf-8") if isinstance(key, str) else bytes(key)
    for ts_url in ts_url_list:
        ts_name = ts_url.split("/")[-1]
        file_path = os.path.join(file_path_dir, ts_name)
        new_file_path = os.path.join(new_file_path_dir, ts_name)
        with open(file_path, 'rb') as f1:
            content = f1.read()
        # A fresh cipher per segment: each file is decrypted starting from
        # the IV, not from the state left by the previous file.
        aes = AES.new(key_bytes, IV=iv, mode=AES.MODE_CBC)
        decrypt_content = aes.decrypt(content)
        # Open the output only after decryption succeeded, so a failure does
        # not leave an empty, truncated file behind.
        with open(new_file_path, 'wb') as f2:
            f2.write(decrypt_content)
    print(f"解密成功, 文件被存放在{new_file_path_dir}")

"""
最后的合并,就是50个，50个的合并，形成一个大的ts，再把这个大的ts进行合并，调用了linux 系统的cat 多个 重定向到一个的 cat 1 2 3 > 4.mp4
windows是 copy /b  整个的.mp4
还有一个方法是用 ffmpeg，这个我没用过。
"""
def merge(name, new_file_path_dir, ts_url_list):
    """Concatenate the decrypted segments into ``<new_file_path_dir>/<name>.mp4``.

    The segments are appended in the order of ``ts_url_list``; each one is
    the file in ``new_file_path_dir`` named after the last ``/``-separated
    component of its URL. The copy is done in Python instead of spawning
    shell ``cat`` commands, which means:

      * the output is complete when the function returns (the spawned
        commands were never waited on, so the final merge could start before
        its inputs were fully written);
      * file names containing spaces or shell metacharacters are safe;
      * it works on every platform;
      * the process working directory is never changed;
      * no intermediate ``big_movie_<n>.ts`` files are produced.

    Args:
        name: Base name of the output file (``.mp4`` is appended).
        new_file_path_dir: Directory holding the decrypted segments; the
            output file is created there as well.
        ts_url_list: Segment URLs (only their last path component is used).

    Raises:
        FileNotFoundError: If a listed segment file does not exist.
    """
    import shutil

    output_path = os.path.join(new_file_path_dir, "%s.mp4" % name)
    with open(output_path, "wb") as merged:
        for ts_url in ts_url_list:
            ts_name = ts_url.split("/")[-1]
            with open(os.path.join(new_file_path_dir, ts_name), "rb") as segment:
                # Streams in chunks, so a whole segment is never in memory.
                shutil.copyfileobj(segment, merged)


if __name__ == '__main__':
    # Settings for this run: download directory, decrypted-output directory,
    # the video page to scrape, and where the second m3u8 is saved.
    movie_name = 'BlackWidow'
    decrypted_dir = 'New_BlackWidow'
    url = 'https://www.tudouyy.com/video/dongzuo/177708/2-1.html'
    playlist_path = 'second_m3u8.txt'

    # Step 1: resolve and save the playlist, then list its segments.
    print(f'开干...电影名称:{movie_name},开始解析地址:{url}')
    parse_page_source(get_page_source(url), playlist_path)
    segment_urls = get_all_ts(playlist_path)

    # Step 2: download every segment concurrently.
    asyncio.run(aio_download(movie_name, segment_urls))
    print('所有文件下载完成...准备进行解密工作...')

    # Step 3: fetch the key and decrypt. The coroutine-based decryption kept
    # raising errors on the server, so the plain sequential version is used.
    decrypt(movie_name, decrypted_dir, segment_urls, get_key(playlist_path))

    # Step 4: merge the decrypted segments into one movie file.
    print(f'还有最后一步,把所有文件合并成电影:{movie_name}')
    merge(movie_name, decrypted_dir, segment_urls)
    print('可以看电影了...')