Downloading simple m3u8 videos with Python

Workflow

Fetch index.m3u8
Extract the link of each video segment
Download and decrypt the segments
Merge the segments
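
The whole flow is small enough to sketch up front. A minimal, hedged version of steps 1 and 2 (the URL is a placeholder and an unencrypted playlist is assumed):

import re
import requests

index_url = 'https://example.com/video/index.m3u8'  # placeholder
playlist = requests.get(index_url).text
# resolve relative segment paths against the playlist's directory
base = index_url.rsplit('/', 1)[0] + '/'
segment_urls = [line if line.startswith('http') else base + line
                for line in playlist.splitlines()
                if line and not line.startswith('#')]
print(len(segment_urls), 'segments')

Steps 3 and 4 are what the rest of this post builds out.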

Preparing some data

import re
import requests

# Getter for this object's url field
@property
def url(self):
    return self._url

# Fetch the m3u8 content
def _get_m3u8_content(self):
    if self._m3u8_content is None:
        headers = {
            # NOTE: replace with your own user-agent
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        page = requests.get(self._url, headers=headers)
        if not page:
            raise RuntimeError("can't get url's content")
        self._m3u8_content = page.text
    return self._m3u8_content

# Get the base of the link, i.e. strip the trailing xxx.m3u8
def _get_head_url(self):
    if self._head_url is None:
        find = re.findall(r'(.*/).*\.m3u8?', self._url)
        if find:
            self._head_url = find[0]
        else:
            raise RuntimeError("can't get head url")
    return self._head_url

# Return the list of segment links
def _get_urls(self):
    if self._urls is None:
        urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
        if urls:
            # if the ts entries are not absolute links, prefix them with head_url
            if not re.match(r'^https?', urls[0]):
                head_url = self._get_head_url()
                urls = list(map(lambda x: head_url + x, urls))
        else:
            raise RuntimeError("can't find urls")
        self._urls = urls
    return self._urls
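
For reference, the regexes above expect a media playlist shaped roughly like this (a made-up example; real playlists vary):

#EXTM3U
#EXT-X-VERSION:3
#EXT-X-TARGETDURATION:10
#EXT-X-KEY:METHOD=AES-128,URI="key.key",IV=0x1566B3C04B8CC84DF491DFB54AF275AB
#EXTINF:9.96,
https://example.com/video/seg0.ts
#EXTINF:9.96,
https://example.com/video/seg1.ts
#EXT-X-ENDLIST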

Decryption

# Whether the stream is encrypted. NOTE: AES in ECB or CBC mode is assumed; anything else needs changes here.
def _get_is_encrypted(self):
    if self._is_encrypted is None:
        if re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):  # search, not match: the KEY line is never the first line
            self._is_encrypted = True
        else:
            self._is_encrypted = False
    return self._is_encrypted

# Get the line of the m3u8 file that carries the encryption info
def _get_encrypted_line(self):
    if self._encrypted_line is None:
        find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
        if find:
            self._encrypted_line = find[0]
        else:
            raise RuntimeError("can't get encrypted line")
    return self._encrypted_line

# Get the encryption key. Padding it to a multiple of 16 bytes (128 bits) is not handled.
def _get_encrypted_key(self):
    if self._encrypted_key is None:
        # the key URI. NOTE: the regex assumes it ends with 'y' (as in .key); adjust otherwise
        find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        head_url = self._get_head_url()
        key = None
        if find:
            key = find[0]
            # if it is not an absolute link, prefix it with head_url
            if not key.startswith('http'):
                key = head_url + key
        else:
            # if no key was found in the m3u8 content, try the common default URL rule
            key = head_url + 'key.key'
        req = requests.get(key)
        if req:
            self._encrypted_key = req.content
        else:
            raise RuntimeError("can't get encrypted key")
    return self._encrypted_key

# Get the encryption IV. Conversion/padding is only partially handled.
def _get_encrypted_iv(self):
    if self._encrypted_iv is None:
        find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        if find:
            self._encrypted_iv = find[0]
        else:
            raise RuntimeError("can't get encrypted iv")
    return self._encrypted_iv
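
The two regexes above are fragile, as the comments admit: the key URI must end in 'y', and the IV group also captures the 0x prefix. A slightly tighter variant, run here on an illustrative key line:

import re

line = '#EXT-X-KEY:METHOD=AES-128,URI="key.key",IV=0x1566B3C04B8CC84DF491DFB54AF275AB\n'
print(re.findall(r'URI="([^"]+)"', line))        # ['key.key']
print(re.findall(r'IV=0x([0-9A-Fa-f]+)', line))  # ['1566B3C04B8CC84DF491DFB54AF275AB']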
    
# Decrypt with AES (ECB or CBC)
def _get_decrypt_content(self, content, key, iv=None):
    aes = None
    if iv is None:
        aes = AES.new(key, AES.MODE_ECB)
    else:
        aes = AES.new(key, AES.MODE_CBC, iv)

    content = aes.decrypt(content)
    return content
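
The author notes that padding is not handled. HLS AES-128 segments are PKCS7-padded, so after decrypting a whole segment the padding should be stripped from the last block; with pycryptodome that is one call (a standalone sketch, not wired into the class):

from Crypto.Cipher import AES
from Crypto.Util.Padding import unpad

def decrypt_segment(content, key, iv):
    aes = AES.new(key, AES.MODE_CBC, iv)
    # drop the PKCS7 padding appended to each AES-128 HLS segment
    return unpad(aes.decrypt(content), AES.block_size)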

# Run the decryption step
def _decrypt_content(self, content):
    # decrypt only if the stream is actually encrypted
    if self._get_is_encrypted():
        key = self._get_encrypted_key()
        iv = self._get_encrypted_iv()
        # the playlist carries the IV as a hex string like '0x...'; AES wants 16 raw bytes
        if isinstance(iv, str):
            iv = bytes.fromhex(iv[2:] if iv.lower().startswith('0x') else iv)
        content = self._get_decrypt_content(content, key, iv)
    return content

Multi-process asynchronous download

  • The program is split into three parts: downloading segments asynchronously, merging segments, and writing to the file
  • Processes are used to exploit multiple CPU cores; each process owns one part

Data shared between the processes

            self._data = m.dict({
                # list of downloaded segment records
                'get': [
                    # {
                    #     # start index
                    #     'start': 1,
                    #     # end index
                    #     'end': 1,
                    #     # raw bytes of the segment
                    #     'content': b'xxxx',
                    # }
                ],
                # total number of segments
                'max_count': len(self._get_urls()),
                # number of segments already written to disk
                'write_count': None,
            })
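
One pattern recurs throughout the code: a Manager().dict() proxy only notices top-level assignment, so nested structures must be copied out, mutated, and assigned back. A standalone illustration:

from multiprocessing import Manager

with Manager() as m:
    d = m.dict({'get': []})
    d['get'].append(1)  # mutates a local copy; the proxy never sees it
    print(d['get'])     # []
    tmp = d['get']
    tmp.append(1)
    d['get'] = tmp      # reassigning the key propagates the change
    print(d['get'])     # [1]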

Asynchronously downloading the segments

    # Get a response asynchronously, retrying recursively on error
    async def _get_rep(self, session, segment_url, count):
        if count <= 0:
            return None
        count -= 1
        try:
            async with session.get(segment_url) as rep:
                # read inside the context manager, or the response is released before use
                return await rep.read()
        except (aiohttp.ClientError, RuntimeError):
            # the recursive call must be awaited, or a bare coroutine is returned
            return await self._get_rep(session, segment_url, count)

    # Render the current download progress bar
    def _process_bar(self, cur, end):
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    # Asynchronous request; depends on aiohttp
    async def _async_get_segment_content(self, segment_url, semaphore, index):
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                # retry up to 3 times on error (unused here; see _get_rep above)
                # rep = self._get_rep(session, segment_url, 3)
                content = b''  # default, so the final return works even after an error
                try:
                    async with session.get(segment_url) as rep:
                        if rep:
                            content = await rep.read()
                            content = self._decrypt_content(content)
                            with self._lock:
                                # nested data in a Manager.dict cannot be modified in place; copy, modify, reassign
                                data = self._data['get']
                                data.append(
                                    {
                                        'start': index,
                                        'end': index,
                                        'content': content
                                    }
                                )
                                self._data['get'] = data
                except (Exception, RuntimeError):
                    # on error, insert an empty placeholder
                    with self._lock:
                        data = self._data['get']
                        data.append(
                            {
                                'start': index,
                                'end': index,
                                'content': b''
                            }
                        )
                        self._data['get'] = data
                # print the progress bar
                self._process_bar(index + 1, self._data['max_count'])
                return content

    # Fetch all segment data
    def _get_all_contents(self):
        loop = asyncio.new_event_loop()
        semaphore = asyncio.Semaphore(self._run_count)
        urls = self._get_urls()
        tasks = []
        # tried a list comprehension here, but it raised an error; a working variant appears in the Supplement below
        for k, v in enumerate(urls):
            task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
  • Regrets:
    1. When session.get fails, it does not retry a few times (a fix is sketched below)
    2. Could not get the task list to build as a list comprehension
    3. The aiohttp.ClientSession() object is not reused across requests
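
For regret 1, the retry is easier with a plain loop than with recursion. A sketch under the same assumptions as the code above (aiohttp session; the broad exception handling is deliberate):

    # fetch one segment, retrying a few times before giving up
    async def _fetch_with_retry(self, session, segment_url, retries=3):
        for attempt in range(retries):
            try:
                async with session.get(segment_url) as rep:
                    return await rep.read()
            except (aiohttp.ClientError, asyncio.TimeoutError):
                await asyncio.sleep(1)
        return None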

Merging the segments

    # Merge video segments
    def _merge_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 1:
                time.sleep(2)
                continue
            # take the lock before iterating, so the data cannot change mid-iteration
            with self._lock:
                for k, v in enumerate(self._data['get']):
                    for i, j in enumerate(self._data['get']):
                        # skip itself
                        if k == i:
                            continue
                        if v['end'] + 1 == j['start']:
                            get_data = self._data['get']
                            get_data[k]['content'] += get_data[i]['content']
                            get_data[k]['end'] = get_data[i]['end']
                            get_data.pop(i)
                            # print('\nmerging segment-' + str(v['start']) + ' << ' + str(j['start']))
                            self._data['get'] = get_data
                            break
                    # for/else: break out of both loops
                    else:
                        continue
                    break
            time.sleep(1)
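
To make the bookkeeping concrete: suppose segments 0, 1 and 3 have arrived but 2 has not. The first two records are adjacent (end + 1 == start), so they collapse into one; segment 3 waits until the gap fills:

    records = [{'start': 0, 'end': 0}, {'start': 1, 'end': 1}, {'start': 3, 'end': 3}]
    # after one merge pass:
    # [{'start': 0, 'end': 1, 'content': seg0 + seg1}, {'start': 3, 'end': 3, ...}]

The writer below then only ever consumes the record whose start follows write_count, which keeps the output file in order.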

Writing to the file

    # Write segment contents out to the file
    def _write_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 0:
                time.sleep(0.1)  # avoid busy-waiting while the list is empty
                continue
            content = None
            with self._lock:
                # write only one record per pass, because the list being iterated is modified
                for k, v in enumerate(self._data['get']):
                    if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
                        content = v['content']
                        self._data['write_count'] = v['end']
                        get_list = self._data['get']
                        get_list.pop(k)
                        # print('\nwriting segment-' + str(v['start']))
                        self._data['get'] = get_list
                        break
            # do the actual disk write outside the lock, so the lock is not held during I/O
            if content:
                with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
                    f.write(content)
            time.sleep(0.05)

Full code

  • Note: the request headers need to be changed to your own
import asyncio
import os.path
import re
import sys
import time
from multiprocessing import Lock, Manager, Process

import aiohttp
import requests
from Crypto.Cipher import AES


class M3u8(object):
    """
    初始化M3u8对象

    参数说明:
    ------------------------
    m3u8_url : str
        m3u8的链接
    run_count : int
        同一时间内最多请求的数量
    save_file_dir : str
        保存文件的目录
    save_file_name : str
        保存文件的名称
    ------------------------
    """
    def __init__(self, m3u8_url, run_count, save_file_dir=None, save_file_name=None) -> None:
        self._m3u8_url = m3u8_url
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        self._run_count = run_count

        self._m3u8_content = None
        self._head_url = None
        self._is_encrypted = None
        self._encrypted_line = None
        self._encrypted_key = None
        self._encrypted_iv = None
        self._urls = None

        if save_file_dir is None:
            save_file_dir = './'
        if save_file_name is None:
            save_file_name = 'mv.mp4'
        # write the defaults back to the instance attributes as well
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        if not os.path.exists(save_file_dir):
            os.mkdir(save_file_dir)
        # NOTE: an existing file of the same name at the download location is deleted
        if os.path.exists(os.path.join(save_file_dir, save_file_name)):
            os.remove(os.path.join(save_file_dir, save_file_name))

    # Fetch the m3u8 content
    def _get_m3u8_content(self):
        if self._m3u8_content is None:
            # NOTE: replace with your own request headers
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
            }
            page = requests.get(self._m3u8_url, headers=headers)
            if not page:
                raise RuntimeError("can't get url's content")
            self._m3u8_content = page.text
        return self._m3u8_content

    # Get the base of the link, i.e. strip the trailing xxx.m3u8
    def _get_head_url(self):
        if self._head_url is None:
            find = re.findall(r'(.*/).*\.m3u8?', self._m3u8_url)
            if find:
                self._head_url = find[0]
            else:
                raise RuntimeError("can't get head url")
        return self._head_url

    # Whether the stream is encrypted. NOTE: AES in ECB or CBC mode is assumed; anything else needs changes here.
    def _get_is_encrypted(self):
        if self._is_encrypted is None:
            if re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):  # search, not match: the KEY line is never first
                self._is_encrypted = True
            else:
                self._is_encrypted = False
        return self._is_encrypted

    # Get the line of the m3u8 file that carries the encryption info
    def _get_encrypted_line(self):
        if self._encrypted_line is None:
            find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
            if find:
                self._encrypted_line = find[0]
            else:
                raise RuntimeError("can't get encrypted line")
        return self._encrypted_line

    # Get the encryption key. Padding it to a multiple of 16 bytes (128 bits) is not handled.
    def _get_encrypted_key(self):
        if self._encrypted_key is None:
            # the key URI. NOTE: the regex assumes it ends with 'y' (as in .key); adjust otherwise
            find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            head_url = self._get_head_url()
            key = None
            if find:
                key = find[0]
                # if it is not an absolute link, prefix it with head_url
                if not key.startswith('http'):
                    key = head_url + key
            else:
                # if no key was found in the m3u8 content, try the common default URL rule
                key = head_url + 'key.key'
            req = requests.get(key)
            if req:
                self._encrypted_key = req.content
            else:
                raise RuntimeError("can't get encrypted key")
        return self._encrypted_key

    # Get the encryption IV. Conversion/padding is only partially handled.
    def _get_encrypted_iv(self):
        if self._encrypted_iv is None:
            find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            if find:
                self._encrypted_iv = find[0]
            else:
                raise RuntimeError("can't get encrypted iv")
        return self._encrypted_iv

    # Decrypt with AES (ECB or CBC)
    def _get_decrypt_content(self, content, key, iv=None):
        aes = None
        if iv is None:
            aes = AES.new(key, AES.MODE_ECB)
        else:
            aes = AES.new(key, AES.MODE_CBC, iv)

        content = aes.decrypt(content)
        return content

    # Run the decryption step
    def _decrypt_content(self, content):
        # decrypt only if the stream is actually encrypted
        if self._get_is_encrypted():
            key = self._get_encrypted_key()
            iv = self._get_encrypted_iv()
            # the playlist carries the IV as a hex string like '0x...'; AES wants 16 raw bytes
            if isinstance(iv, str):
                iv = bytes.fromhex(iv[2:] if iv.lower().startswith('0x') else iv)
            content = self._get_decrypt_content(content, key, iv)
        return content

    # Return the list of segment links
    def _get_urls(self):
        if self._urls is None:
            urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
            if urls:
                # if the ts entries are not absolute links, prefix them with head_url
                if not re.match(r'^https?', urls[0]):
                    head_url = self._get_head_url()
                    urls = list(map(lambda x: head_url + x, urls))
            else:
                raise RuntimeError("can't find urls")
            self._urls = urls
        return self._urls

    # Render the current download progress bar
    def _process_bar(self, cur, end):
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    # Asynchronous request; depends on aiohttp
    async def _async_get_segment_content(self, segment_url, semaphore, index):
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                content = b''  # default, so the final return works even after an error
                try:
                    async with session.get(segment_url) as rep:
                        if rep:
                            content = await rep.read()
                            content = self._decrypt_content(content)
                            with self._lock:
                                # nested data in a Manager.dict cannot be modified in place; copy, modify, reassign
                                data = self._data['get']
                                data.append(
                                    {
                                        'start': index,
                                        'end': index,
                                        'content': content
                                    }
                                )
                                self._data['get'] = data
                except (Exception, RuntimeError):
                    # on error, insert an empty placeholder
                    with self._lock:
                        data = self._data['get']
                        data.append(
                            {
                                'start': index,
                                'end': index,
                                'content': b''
                            }
                        )
                        self._data['get'] = data
                # print the progress bar
                self._process_bar(index + 1, self._data['max_count'])
                return content

    # Fetch all segment data
    def _get_all_contents(self):
        loop = asyncio.new_event_loop()
        semaphore = asyncio.Semaphore(self._run_count)
        urls = self._get_urls()
        tasks = []
        # tried a list comprehension here, but it raised an error; a working variant appears in the Supplement below
        for k, v in enumerate(urls):
            task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    # Merge video segments
    def _merge_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 1:
                time.sleep(2)
                continue
            # take the lock before iterating, so the data cannot change mid-iteration
            with self._lock:
                for k, v in enumerate(self._data['get']):
                    for i, j in enumerate(self._data['get']):
                        # skip itself
                        if k == i:
                            continue
                        if v['end'] + 1 == j['start']:
                            get_data = self._data['get']
                            get_data[k]['content'] += get_data[i]['content']
                            get_data[k]['end'] = get_data[i]['end']
                            get_data.pop(i)
                            print('\nmerging segment-' + str(v['start']) + ' << ' + str(j['start']))
                            self._data['get'] = get_data
                            break
                    # for/else: break out of both loops
                    else:
                        continue
                    break
            time.sleep(1)

    # Write segment contents out to the file
    def _write_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 0:
                time.sleep(0.1)  # avoid busy-waiting while the list is empty
                continue
            content = None
            with self._lock:
                # write only one record per pass, because the list being iterated is modified
                for k, v in enumerate(self._data['get']):
                    if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
                        content = v['content']
                        self._data['write_count'] = v['end']
                        get_list = self._data['get']
                        get_list.pop(k)
                        print('\nwriting segment-' + str(v['start']))
                        self._data['get'] = get_list
                        break
            # do the actual disk write outside the lock, so the lock is not held during I/O
            if content:
                with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
                    f.write(content)
            time.sleep(0.05)

    # Run
    def run(self):
        self._lock = Lock()
        with Manager() as m:
            self._data = m.dict({
                # list of downloaded segment records
                'get': [],
                # total number of segments
                'max_count': len(self._get_urls()),
                # number of segments already written to disk
                'write_count': None,
            })
            # wanted to use a process pool, but could not get it to run (see the note after the code)
            # pool = Pool(3)
            # pool.apply_async(self._get_all_contents, args=(self,))
            # pool.apply_async(self._merge_segment_content, args=(self,))
            # pool.apply_async(self._write_segment_content, args=(self,))
            # pool.close()
            # pool.join()
            tasks = (self._get_all_contents, self._merge_segment_content, self._write_segment_content)
            processes = [Process(target=v) for v in tasks]
            for p in processes:
                p.start()
            for p in processes:
                p.join()


if __name__ == '__main__':
    url = r'xxx/index.m3u8'
    m3u8 = M3u8(url, 3, './mv', 'mv.mp4')
    m3u8.run()
  • Regrets:
    1. Could not use a process pool (a likely cause is sketched below)
    2. The deepest level of the process-shared data cannot be modified directly
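
On regret 1: the commented-out pool calls pass self twice. self._get_all_contents is already a bound method, so args=(self,) hands it an extra positional argument and the worker raises a TypeError. Dropping args gets closer, though pickling an object that holds a Lock and Manager proxies can still fail depending on the start method. A sketch of the corrected calls (untested, under those caveats):

from multiprocessing import Pool

pool = Pool(3)
for task in (self._get_all_contents, self._merge_segment_content, self._write_segment_content):
    pool.apply_async(task)  # bound method: no args=(self,) needed
pool.close()
pool.join()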

Supplement

Synchronous requests

    # Download a single segment
    def _download_single_mv(self, url):
        req = requests.get(url)
        content = None
        if req:
            content = req.content
        else:
            raise RuntimeError(f"can't download sigle mv, url : {url}")
        # 解密
        content = self._decrypt_content(content)
        return content
    # Download each ts and write it to its own file on disk
    def _download_all_ts(self, save_file_dir):
        urls = self._get_urls()
        for k, url in enumerate(urls):
            content = self._download_single_mv(url)
            with open(save_file_dir + '/0' + str(k) + '.ts', 'wb') as f:
                f.write(content)
                
    # Download each ts and append it straight into one file
    def _download_all_ts_and_merge(self, save_file_dir, save_file_name):
        urls = self._get_urls()
        with open(save_file_dir + '/' + save_file_name, 'ab') as f:
            for url in urls:
                content = self._download_single_mv(url)
                f.write(content)

Asynchronous requests

    # Write a single segment to disk
    def _async_write_file(self, content, index):
        _save_file_dir = self._save_file_dir
        with open(_save_file_dir + '/0' + str(index) + '.ts', 'wb') as f:
            f.write(content)

    # Asynchronous request; depends on aiohttp
    async def _async_get(self, url, semaphore, index):
        # print('index: ' + str(index) + ' url: ' + url)
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                async with session.get(url) as rep:
                    content = await rep.read()
                    content = self._decrypt_content(content)
                    self._async_write_file(content, index)
                    return content

    # Drive the async tasks
    def _async_run(self, count):
        # loop = asyncio.get_event_loop()
        loop = asyncio.new_event_loop()
        # asyncio.set_event_loop(loop)
        semaphore = asyncio.Semaphore(count)
        tasks = []
        urls = self._get_urls()
        for k, url in enumerate(urls):
            task = asyncio.ensure_future(self._async_get(url, semaphore, k), loop=loop)
            # task = asyncio.create_task(self._async_get(url, semaphore, k))
            tasks.append(task)
        # tasks = [asyncio.ensure_future(self._async_get(url, semaphore, k) for k, url in enumerate(self._get_urls()))]
        loop.run_until_complete(asyncio.wait(tasks))
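
The commented-out list comprehension at the bottom fails because of a misplaced parenthesis: the generator expression ends up as the argument to asyncio.ensure_future(), which raises a TypeError. On Python 3.7+ the whole method also shrinks nicely around asyncio.run (a sketch under the same assumptions as above):

    # run all segment downloads; call as asyncio.run(self._async_run_all(count))
    async def _async_run_all(self, count):
        semaphore = asyncio.Semaphore(count)
        tasks = [self._async_get(url, semaphore, k) for k, url in enumerate(self._get_urls())]
        # gather schedules the coroutines itself, so no ensure_future is needed
        await asyncio.gather(*tasks)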

Merging

  • Four ways to merge; file ordering matters for the first two (a sort-key sketch follows this list)
# Delete the ts files after merging
def _del_all_ts_files(self, del_files_dir):
    files = [x for x in os.listdir(del_files_dir)
             if os.path.isfile(del_files_dir + '/' + x) and os.path.splitext(x)[1] == '.ts']
    for file in files:
        os.remove(del_files_dir + '/' + file)
  1. zipfile
# Merge files with zipfile, without compression
def _zipfile_merge(self, save_file_dir, save_file_name, is_del=True):
    # the merge order may need handling (see the sketch after this list)
    files = os.listdir(save_file_dir)
    with zipfile.ZipFile(os.path.join(save_file_dir, save_file_name), 'a') as z:
        for file in files:
            z.write(save_file_dir + '/' + file)
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
  2. Merging with the copy command
# Merge files with the Windows copy command
def _copy_command_merge(self, save_file_dir, save_file_name, is_del=True):
    file_name = save_file_name
    cur_dir = os.path.abspath('.')
    os.chdir(save_file_dir)
    # ordering may cause problems here too (see the sketch after this list)
    os.system("copy /b *.ts new.tmp")
    os.rename("new.tmp", file_name)
    # optionally delete the ts files
    if is_del is True:
        # os.system("del /Q *.ts")
        os.chdir(cur_dir)
        self._del_all_ts_files(save_file_dir)
  3. Merging with ffmpeg
# Merge with ffmpeg. Untested, so correctness is not guaranteed.
def _ffmpeg_merge(self, save_file_dir, save_file_name, is_del=True):
    # prepare a local index.m3u8 file
    m3u8_content = self._get_m3u8_content()
    # drop the encryption line
    m3u8_content = re.sub(r'(#EXT-X-KEY:METHOD.*\n)', '', m3u8_content)
    # replace every segment URL with the matching local ts file name
    urls = re.findall(r'(h.*\.ts)', m3u8_content)
    for k, v in enumerate(urls):
        m3u8_content = re.sub(re.escape(v), '0' + str(k) + '.ts', m3u8_content, count=1)
    # remember the current directory
    cur_dir = os.path.abspath('.')
    # switch to the save directory
    os.chdir(save_file_dir)
    # write the playlist to a file (text mode: m3u8_content is a str)
    with open('index.m3u8', 'w', encoding='utf-8') as f:
        f.write(m3u8_content)
    # run the merge command
    os.system("ffmpeg -i index.m3u8 -c copy " + save_file_name)
    # switch back to the original directory
    os.chdir(cur_dir)
    # optionally delete the ts files
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
  4. Download each segment into memory and append it directly to the video file on disk, skipping temporary .ts files
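
Both the zipfile and copy approaches depend on file order, and lexicographic order puts 010.ts before 02.ts. Since the async writer saves segments as 0<index>.ts, a numeric sort key fixes the ordering (a small helper, assuming that naming scheme):

import os
import re

def sorted_ts_files(dir_path):
    # pull the integer out of names like '07.ts' / '012.ts' and sort numerically
    ts = [f for f in os.listdir(dir_path) if f.endswith('.ts')]
    return sorted(ts, key=lambda f: int(re.search(r'(\d+)', f).group(1)))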

Postscript

It is better to download the whole video first and only then decrypt it.

I later wrote some more, dated 2022-12-14 17:18. When I reached the step of encrypting the NetEase Cloud Music search string before sending the request, I lost interest and gave up. By now I have forgotten nearly all of it; only this code of questionable shape remains. I will pick it up again if the interest ever returns.

#!/usr/bin/env python3
# For learning and reference only. Be considerate of sites offering free resources; do not send them large volumes of requests.
# Dependencies to install:
# python -m pip install fake-useragent lxml requests aiohttp pycryptodome
import argparse
import asyncio
import json
import os
import random
import re
import sys
import time
from enum import Enum
from functools import reduce
from pprint import pprint
from urllib.parse import quote, urlparse, urlsplit
from multiprocessing import Lock, Manager, Process
from Crypto.Cipher import AES

import aiohttp
import requests
from requests.exceptions import ReadTimeout
from fake_useragent import UserAgent, UserAgentError
from lxml import etree


class spider_tools(object):
    def __init__(self):
        option = Enum('spider_option', ('bayizww', 'neteasemusic', 'hanTvn'))
        hint = {option.bayizww: '八一中文网 (novels)', option.neteasemusic: '网易云音乐 (music)', option.hanTvn: '韩剧Tvn (video)'}
        option_str = reduce(lambda x, y: x + y, ['\n  [' + mem.name + '] ' + hint[mem] + '、' for mem in option])
        parse = argparse.ArgumentParser()
        parse.add_argument('spider_option', help='choose a download option:' + option_str, type=str)
        parse.add_argument('search_str', help='what to search for', type=str)
        parse.add_argument('-d', '--save_dir', type=str, help='where to save', default='./')
        parse.add_argument('-c', '--link_count', type=int, help='number of simultaneous connections', default=1)
        if len(sys.argv) >= 2:
            spider_option = sys.argv[1]
        else:
            print('choose a download option:' + option_str)
            return
        if spider_option == option.bayizww.name:
            parse_args = parse.parse_args(sys.argv[1:])
            spider_81txt(parse_args)
        elif spider_option == option.neteasemusic.name:
            # add more arguments, parse them, then pass the parsed args in
            parse_args = parse.parse_args(sys.argv[1:])
        elif spider_option == option.hanTvn.name:
            parse_args = parse.parse_args(sys.argv[1:])
            spider_hanTvn_mv(parse_args)


ua = UserAgent()


class spider_common(object):

    @staticmethod
    def _headers():
        try:
            return {'user-agent': ua[random.choice(ua.browsers)]}
        except UserAgentError as e:
            return {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62'
            }

    # Render the current download progress bar
    @staticmethod
    def _process_bar(cur, end):
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    # NOTE: calling the staticmethod in a default argument may require Python 3.10+
    async def _async_download_file(self, url, link_count, func, cur, end, headers=_headers()):
        async with link_count:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(url, headers=headers) as rep:
                        content = await rep.read()
                        if func:
                            self._process_bar(cur, end)
                            return func(content)
                        else:
                            raise Exception('no callback func provided')
                except (Exception, RuntimeError) as e:
                    print('\nFailed to fetch ' + url + ', choose how to handle it:')
                    print(e)
                    print('0: fetch it again\n1: drop this content')
                    get_option = self._input_select_index('choose an option', 0, 2)
                    if get_option == 0:
                        # the recursive retry must be awaited
                        return await self._async_download_file(url, link_count, func, cur, end)
                    else:
                        return self._download_error_return

    # strip the trailing episode marker: 虽然是精神病但没关系16.mp4 -> 虽然是精神病但没关系
    _del_url_episode_reg = re.compile(r'第?\w+\.mp4')

    def _sync_download_mv(self, url, save_dir, chunk_size):
        url_base_name = os.path.basename(url)
        url = url.replace(url_base_name, quote(url_base_name))

        tmp_count = 1
        sleep_time = 3.5
        response = None
        while tmp_count < 4:
            try:
                response = requests.get(url=url, stream=True, timeout=1, headers=self._headers())
                if response.status_code != 200:
                    print('Failed to get the video download response, status code: ' + str(response.status_code))
                    raise ReadTimeout
                else:
                    break
            except ReadTimeout:
                print('Video download response timed out, retrying')
                time.sleep(sleep_time)
                tmp_count += 1
        if response is None or response.status_code != 200:
            print('Could not get a video download response, exiting')
            if response is not None:
                response.close()
            return
        print('Total size: ' + str(int(response.headers['Content-Length']) / 1024 / 1024) + 'M')
        print('Chunk size: ' + str(chunk_size / 1024 / 1024) + 'M')

        save_dir = os.path.join(save_dir, self._del_url_episode_reg.sub('', url_base_name))
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        save_path = os.path.join(save_dir, url_base_name)
        if os.path.exists(save_path):
            print('File already exists: ' + save_path)
            return
        with open(save_path, mode='wb') as f:
            print('Starting download: ' + url_base_name)
            for chunk in response.iter_content(chunk_size):
                if chunk:
                    self._process_bar(os.path.getsize(save_path), int(response.headers['Content-Length']))
                    f.write(chunk)
                if os.path.getsize(save_path) > 10 * 1024 * 1024:
                    return  # debug cap: stops after 10 MB; remove for full downloads
        response.close()

    # Fetch a page as an object that can be queried with xpath
    @staticmethod
    def _get_xpath_html(url, headers=_headers(), delay_time=None, timeout=1):
        print('timeout:' + str(timeout))
        if delay_time:
            time.sleep(delay_time)
        tmp_count = 1
        sleep_time = 3.5
        while tmp_count < 4:
            try:
                with requests.get(url, headers=headers, timeout=timeout) as response:
                    if response.status_code == 200:
                        return etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
                    else:
                        print('Request failed, status code: ' + str(response.status_code))
                        raise ReadTimeout
            except ReadTimeout:
                print('Request timed out; retry attempt ' + str(tmp_count) + ', waiting ' + str(sleep_time) + 's before resending')
                time.sleep(sleep_time)
                tmp_count += 1
        raise Exception('Still no response after ' + str(tmp_count - 1) + ' retries')

    @staticmethod
    def _input_select_index(show_text, start, end):
        while True:
            try:
                input_str = input(show_text + ':')
                if input_str == 'exit' or input_str == '^C':
                    return 'exit'
                get_index = int(input_str)
                if not (get_index in range(start, end)):
                    raise ValueError('please enter a value in the range {' + str(start) + ',' + str(end - 1) + '}')
                else:
                    return get_index
            except (KeyboardInterrupt, ValueError) as e:
                print('\nError: please enter a valid number')
                print(e)


class spider_81txt(spider_common):
    # Search for books and return the results
    def _search_books_by_name(self, search_str):
        if search_str is None:
            raise Exception("search_str is None")
        # search pagination is not supported yet
        url = r'https://www.81zw.com/search.php?keyword='
        # URL-encode the Chinese characters
        # see https://blog.csdn.net/qq_33267306/article/details/121677064
        search_url = url + quote(search_str)
        search_result_books = self._get_xpath_html(search_url).xpath(
            '//div[@class="result-list gameblock-result-list"]/div')
        books_info = []
        for k in search_result_books:
            info_div = k.xpath('.//div[@class="result-game-item-detail"]')[0]
            # title
            book_title = info_div.xpath('.//h3/a/text()')[0].strip()
            # URL
            book_url = info_div.xpath('.//h3/a/@href')[0]
            book_url = re.sub('/search.*', '', url) + book_url
            # synopsis
            synopsis = info_div.xpath('.//p[@class="result-game-item-desc"]/text()')[0]
            # other info
            book_other_info = info_div.xpath('.//div[@class="result-game-item-info"]/p')
            # author
            book_author = book_other_info[0].xpath('.//span')[1].xpath('.//text()')[0]
            # type
            book_type = book_other_info[1].xpath('.//span')[1].xpath('.//text()')[0]
            # update time
            book_update_time = book_other_info[2].xpath('.//span')[1].xpath('.//text()')[0]
            # latest chapter
            book_latest_chapters = book_other_info[3].xpath('.//a/text()')[0]
            books_info.append({
                "title": book_title,
                "url": book_url,
                "synopsis": synopsis,
                "author": book_author,
                "type": book_type,
                "update_time": book_update_time,
                "latest_chapters": book_latest_chapters,
            })
        return books_info

    # Get the book's name and the links to all its chapters
    def _get_book_info(self, html):
        title = html.xpath("//div[@id='info']/h1/text()")[0].strip()
        # book/61100/256885.html -> 256885.html : keep only the trailing file name
        chapters = list(map(lambda x: re.sub(r'/\w+/\d+/', '', x), html.xpath("//div[@id='list']/dl/dd/a/@href")))
        return {"title": title, "chapters": chapters}

    _book_content_regexp_compiled_str = re.compile(r'(([8|八|⑧][1|(一|壹)|①])中文[网|網])|'
                                                   r'(([m|(www)]\.)*([8|八|⑧][1|(一|壹)|①])([z|Z][w|W])\.([ć|c]|[ő|o]|m))|'
                                                   r'(网页版章节内容慢,请下载爱阅小说app阅读最新内容(.|\n|\r)*)|'
                                                   r'(本章未完,点击\[下一页\]继续阅读-->>[\n|\r].*)|'
                                                   r'(.*?推荐一本书[.|\n|\r]*)|'
                                                   r'(\?此章节正在\?ww\.om努力更新ing,请稍后刷新访问[.|\n|\r]*)'
                                                   )

    # Get the nth chapter's title and content
    def _get_chapter_info(self, html):
        # chapter title; strip removes surrounding whitespace/newlines (there may be none, which is fine)
        chapter_title = html.xpath("//div[@class='bookname']/h1/text()")[0].strip()

        # get the content list, replace newlines and full-width spaces, then join everything
        book_content = ''.join(  # join the lines together
            list(map(lambda x:
                     x.replace('\n', '').replace('\u3000\u3000', '\n'),
                     html.xpath('//div[@id="content"]/text()')))
        )
        # strip watermark strings such as 81中文网
        # and variants like m.81ZW.ćőm, ⑧①.ZW.ćőm, www.八壹.zw.ćőm
        # '.' does not match newlines, hence patterns like (.|\n|\r)* / [\s\S]* / the re.S|re.DOTALL flags
        # see https://www.cnblogs.com/hiyong/p/15376798.html
        book_content = self._book_content_regexp_compiled_str.sub('', book_content)
        return {"chapter_title": chapter_title, "book_content": book_content}

    # Data returned when a download fails
    _download_error_return = {"chapter_title": '', "book_content": ''}

    def _get_txt_book(self, search_book_name, save_dir='./', semaphore=1):
        """
            从81中文网获取书籍

            参数说明:
            ------------------------
            search_book: str
                搜索内容
            save_dir : str
                存储位置
            semaphore : int
                同时连接数量
            ------------------------
        """
        books = self._search_books_by_name(search_book_name)
        # print the info of the books that were found
        for k, v in enumerate(books):
            book_info = 'Title: ' + v['title'] + ' Author: ' + v["author"] + ' Latest chapter: ' + v["latest_chapters"] + ' Updated: ' + v[
                "update_time"] + ' Type: ' + v["type"] + '\n  Synopsis: ' + v["synopsis"]
            book_info = '[' + str(k) + ']' + '<\n  ' + book_info + '\n>' + '\n'
            print(book_info)

        # choose a book
        select_index = self._input_select_index('enter the book index', 0, len(books))
        if select_index == 'exit':
            print('Aborting')
            return
        select_book_url = books[select_index]["url"]

        # get the book title and all chapter links
        book_info = self._get_book_info(self._get_xpath_html(select_book_url))
        book_title = book_info['title']

        # download every chapter of the chosen book
        contents = {'title': book_title}
        loop = asyncio.new_event_loop()
        semaphore = asyncio.Semaphore(semaphore)  # the parameter is the connection count
        tasks = []
        chapters_count = len(book_info['chapters'])
        for k, v in enumerate(book_info['chapters']):
            chapter_url = select_book_url + v
            task = asyncio.ensure_future(self._async_download_file(
                chapter_url,
                semaphore,
                lambda content: self._get_chapter_info(etree.HTML(content)),
                k,
                chapters_count), loop=loop
            )
            tasks.append(task)
        # collect the results
        result = loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
        contents['chapters_content'] = result

        print('\nDownload finished, writing the file')
        # write to file
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        with open(os.path.join(save_dir, (contents["title"] + '.txt')), 'w', encoding='utf-8') as f:
            # write the book title
            f.writelines('\t\t\t\t\t\t\t' + contents["title"] + '\n\n')
            for v in contents['chapters_content']:
                # write the chapter title
                f.writelines('\n' + v['chapter_title'])
                # write the content
                f.write(v['book_content'] + '\n\n')
        print('Downloaded <' + contents["title"] + '> successfully')

    def __init__(self, parse_args):
        self._get_txt_book(parse_args.search_str, parse_args.save_dir, parse_args.link_count)


class spider_hanTvn_mv(spider_common):
    _base_url = r'https://www.hantvn.com'

    def __init__(self, parse_args):
        print('If it hangs or times out, please wait a moment: searches are throttled to one per 3 seconds, and up to 3 reconnects are attempted automatically')
        search_str = parse_args.search_str
        save_dir = parse_args.save_dir
        semaphore = parse_args.link_count
        search_results = self._search(search_str)
        for k, v in enumerate(search_results):
            mv_info = 'Title: ' + v['title'] + ' Director: ' + v['director'] + ' Starring: ' + v['protagonist'] + ' Category: ' + v[
                'kind'] + ' Region: ' + v['area'] + ' Year: ' + v['year'] + ' \n  Synopsis: ' + v['details']
            mv_info = '[' + str(k) + ']' + '<\n  ' + mv_info + '\n>' + '\n'
            print(mv_info)

        # choose a video
        select_index = self._input_select_index('enter the video index', 0, len(search_results))
        if select_index == 'exit':
            print('Aborting')
            return
        select_mv_url = self._base_url + search_results[select_index]["link"]
        print('\033[34mVideo homepage: \033[0m' + select_mv_url)

        link_list = self._download_links(select_mv_url)
        if link_list:
            print('\033[31mI do not recommend downloading with this thing; Thunder and the like are far faster, and the homepage offers full downloads. Per-episode links below\033[0m')
            print(''.join([k['episode_name'] + ':' + k['link'] + '\n' for k in link_list]))
            begin, over = self._select_episode(link_list)
            if begin == 'exit':
                print('Aborting')
                return
            select_list = link_list[begin:over]
            # 5 MB per chunk
            chunk_size = 5 * 1024 * 1024
            for k in select_list:
                self._sync_download_mv(k['link'], save_dir, chunk_size)
        else:
            print('Falling back to m3u8')
            select_xpath_html = self._get_xpath_html(select_mv_url, delay_time=0.5)
            mv_name = select_xpath_html.xpath('/html/body/div[1]/div/div/div/div/div/div[2]/h1[@class="title"]/text()')[
                0]
            save_dir = os.path.join(save_dir, mv_name)
            episode_links = self._get_select_provider(select_xpath_html)
            if episode_links == 'exit':
                print('Aborting')
                return
            if episode_links:
                begin, over = self._select_episode(episode_links)
                if begin == 'exit':
                    print('Aborting')
                    return
                select_list = episode_links[begin:over]
                print('You can watch online through an m3u8 player page, e.g.:\nWujin player https://jx.wujinkk.com/dplayer/?url= <m3u8 url>')
                print('\033[31mAgain, I do not recommend downloading with this thing; IDM and the like are much faster. Per-episode links below\033[0m')
                print('The following requests take a long time; please be patient')
                for k in select_list:
                    k['m3u8_url'] = self._get_m3u8_url(k['href'])
                print(''.join([k['episode_name'] + ':' + k['m3u8_url'] + '\n' for k in select_list]))

                for k in select_list:
                    spider_m3u8(k['m3u8_url'], semaphore, save_dir, mv_name + k['episode_name'] + '.mp4')
            else:
                print('No video links in this source')
                return

    def _search(self, search_str):
        search_str = quote(search_str)
        base_url = self._base_url
        url = base_url + r'/search.html?wd=%s&submit=' % search_str
        xpath_result_list = self._get_xpath_html(url).xpath(
            '/html/body/div[1]/div/div[1]/div/div/div[2]/ul/li/div[@class="detail"]')
        result_list = []
        if xpath_result_list:
            for k in xpath_result_list:
                # title
                title = k.xpath('string(.//h4/a)')
                # link
                link = k.xpath('.//h4/a/@href')[0]
                # director
                director = k.xpath('.//p[1]/text()')[0]
                # starring
                protagonist = k.xpath('.//p[2]/text()')[0]
                info_list = k.xpath('.//p[3]/text()')
                # category
                kind = info_list[0]
                # region
                area = info_list[1]
                # year
                year = info_list[2]
                # details
                details = k.xpath('.//p[4]/text()')[0]
                result_list.append({
                    'title': title,
                    'link': link,
                    'director': director,
                    'protagonist': protagonist,
                    'kind': kind,
                    'area': area,
                    'year': year,
                    'details': details,
                })
        return result_list

    def _download_links(self, url):
        xpath_link_list = self._get_xpath_html(url, delay_time=0.5).xpath(
            '/html/body/div[2]/div/div[contains(string(), "迅雷下载")]/div/div[2]/ul[1]/li')
        link_list = []
        if xpath_link_list:
            for k in xpath_link_list:
                episode_name = k.xpath('.//span[1]/a/text()')[0]
                link = k.xpath('.//span[2]/a[3]/@data-text')[0]
                link_list.append({
                    'episode_name': episode_name,
                    'link': link,
                })
        return link_list

    def _get_select_provider(self, html):
        xpath_provider_link_lists = html.xpath('/html/body/div[2]/div/div[2]/div/div[1]/div/ul/li/a')
        provider_link_infos = []
        if xpath_provider_link_lists:
            for k in xpath_provider_link_lists:
                href = k.xpath('.//@href')[0].replace('#', '')
                text = k.xpath('.//text()')[0]
                provider_link_infos.append({
                    'href': href,
                    'text': text
                })
        # choose a source
        provider_links = []
        if provider_link_infos:
            print('Choose a source:')
            provider_link_str = ''.join(['  [' + str(k) + '] ' + v['text'] for k, v in enumerate(provider_link_infos)])
            print(provider_link_str)
            select_index = self._input_select_index('enter the source index', 0, len(provider_link_infos))
            if select_index == 'exit':
                return 'exit'
            select_provider = provider_link_infos[select_index]
            a_links = html.xpath(
                '/html/body/div[2]/div/div[2]/div/div[2]/div[@id="' + select_provider['href'] + '"]/ul/li/a')
            if a_links:
                for k in a_links:
                    href = k.xpath('.//@href')[0]
                    episode_name = k.xpath('.//text()')[0]
                    provider_links.append({
                        'href': self._base_url + href,
                        'episode_name': episode_name,
                    })
        return provider_links

    def _select_episode(self, episode_list):
        # choose the episode range
        max_num = len(episode_list)
        print('Choose the first and last episode {0-' + str(max_num - 1) + '}')
        begin = self._input_select_index('enter the first episode', 0, max_num)
        if begin == 'exit':
            return 'exit', 'exit'
        over = self._input_select_index('enter the last episode', begin, max_num)
        if over == 'exit':
            return 'exit', 'exit'
        if over == (max_num - 1):
            over = None
        return begin, over

    _extract_json_data_reg = re.compile(r'\{.*\}')
    _m3u8_url_reg = re.compile(r'.*\.m3u8')

    def _get_m3u8_url(self, href):
        # the iframe under this page's /static/player/xx.js resource carries the access link
        # /static/js/playerconfig.js?t=20221209 holds partial addresses for each source
        # here I only parse the JSON in the script tag above the player and check its url for an .m3u8 marker
        json_code = self._get_xpath_html(href, delay_time=0.5, timeout=20) \
            .xpath('/html/body/div[1]/div/div/div/div[1]/div/div[2]/script[1]/text()')[0]
        if json_code:
            json_data = self._extract_json_data_reg.findall(json_code)[0]
            m3u8_url = json.loads(json_data)['url']
            if not self._m3u8_url_reg.match(m3u8_url):
                print('Could not extract an m3u8 url')
                return ''

            return m3u8_url
        else:
            print('Could not extract an m3u8 url')
            return ''


class spider_m3u8(spider_common):
    """
    初始化M3u8对象

    参数说明:
    ------------------------
    m3u8_url : str
        m3u8的链接
    run_count : int
        同一时间内最多请求的数量
    save_file_dir : str
        保存文件的目录
    save_file_name : str
        保存文件的名称
    ------------------------
    """

    def __init__(self, m3u8_url, run_count, save_file_dir=None, save_file_name=None) -> None:
        self._m3u8_url = m3u8_url
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        self._run_count = run_count

        self._m3u8_content = None
        self._head_url = None
        self._is_encrypted = None
        self._encrypted_line = None
        self._encrypted_key = None
        self._encrypted_iv = None
        self._urls = None

        if save_file_dir is None:
            save_file_dir = './'
        if save_file_name is None:
            save_file_name = 'mv.mp4'
        # write the defaults back to the instance attributes as well
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        if not os.path.exists(save_file_dir):
            os.mkdir(save_file_dir)
        if os.path.exists(os.path.join(save_file_dir, save_file_name)):
            print('This file already exists')
            # os.remove(os.path.join(save_file_dir, save_file_name))
        else:
            self.run()

    # Fetch the m3u8 content
    def _get_m3u8_content(self):
        if self._m3u8_content is None:
            # NOTE: replace with your own request headers
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
            }
            page = requests.get(self._m3u8_url, headers=headers)
            if not page:
                raise RuntimeError("can't get url's content")
            self._m3u8_content = page.text
        return self._m3u8_content

    # Get the base of the link, i.e. strip the trailing xxx.m3u8
    def _get_head_url(self):
        if self._head_url is None:
            find = re.findall(r'(.*/).*\.m3u8?', self._m3u8_url)
            if find:
                self._head_url = find[0]
            else:
                raise RuntimeError("can't get head url")
        return self._head_url

    # Whether the stream is encrypted. NOTE: AES in ECB or CBC mode is assumed; anything else needs changes here.
    def _get_is_encrypted(self):
        if self._is_encrypted is None:
            if re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):  # search, not match: the KEY line is never first
                self._is_encrypted = True
            else:
                self._is_encrypted = False
        return self._is_encrypted

    # Get the line of the m3u8 file that carries the encryption info
    def _get_encrypted_line(self):
        if self._encrypted_line is None:
            find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
            if find:
                self._encrypted_line = find[0]
            else:
                raise RuntimeError("can't get encrypted line")
        return self._encrypted_line

    # Get the encryption key. Padding it to a multiple of 16 bytes (128 bits) is not handled.
    def _get_encrypted_key(self):
        if self._encrypted_key is None:
            # the key URI. NOTE: the regex assumes it ends with 'y' (as in .key); adjust otherwise
            find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            head_url = self._get_head_url()
            key = None
            if find:
                key = find[0]
                # if it is not an absolute link, prefix it with head_url
                if not key.startswith('http'):
                    key = head_url + key
            else:
                # if no key was found in the m3u8 content, try the common default URL rule
                key = head_url + 'key.key'
            req = requests.get(key)
            if req:
                self._encrypted_key = req.content
            else:
                raise RuntimeError("can't get encrypted key")
        return self._encrypted_key

    # Get the encryption IV. Conversion/padding is only partially handled.
    def _get_encrypted_iv(self):
        if self._encrypted_iv is None:
            find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            if find:
                self._encrypted_iv = find[0]
            else:
                raise RuntimeError("can't get encrypted iv")
        return self._encrypted_iv

    # Decrypt with AES (ECB or CBC)
    def _get_decrypt_content(self, content, key, iv=None):
        aes = None
        if iv is None:
            aes = AES.new(key, AES.MODE_ECB)
        else:
            aes = AES.new(key, AES.MODE_CBC, iv)

        content = aes.decrypt(content)
        return content

    # Run the decryption step
    def _decrypt_content(self, content):
        # decrypt only if the stream is actually encrypted
        if self._get_is_encrypted():
            key = self._get_encrypted_key()
            iv = self._get_encrypted_iv()
            # the playlist carries the IV as a hex string like '0x...'; AES wants 16 raw bytes
            if isinstance(iv, str):
                iv = bytes.fromhex(iv[2:] if iv.lower().startswith('0x') else iv)
            content = self._get_decrypt_content(content, key, iv)
        return content

    # Smart decryption with ciphey; could not get it installed, so this is left empty
    def _decrypt_content_by_cipher(self, content):
        pass

    # Return the list of segment links
    def _get_urls(self):
        if self._urls is None:
            urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
            if urls:
                # if the ts entries are not absolute links, prefix them with head_url
                if not re.match(r'^https?', urls[0]):
                    head_url = self._get_head_url()
                    urls = list(map(lambda x: head_url + x, urls))
            else:
                raise RuntimeError("can't find urls")
            self._urls = urls
        return self._urls

    # Render the current download progress bar
    def _process_bar(self, cur, end):
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    # Asynchronous request; depends on aiohttp
    async def _async_get_segment_content(self, segment_url, semaphore, index):
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                content = b''  # default, so the final return works even after an error
                try:
                    async with session.get(segment_url) as rep:
                        if rep:
                            content = await rep.read()
                            content = self._decrypt_content(content)
                            with self._lock:
                                # nested data in a Manager.dict cannot be modified in place; copy, modify, reassign
                                data = self._data['get']
                                data.append(
                                    {
                                        'start': index,
                                        'end': index,
                                        'content': content
                                    }
                                )
                                self._data['get'] = data
                except (Exception, RuntimeError):
                    # on error, insert an empty placeholder
                    with self._lock:
                        data = self._data['get']
                        data.append(
                            {
                                'start': index,
                                'end': index,
                                'content': b''
                            }
                        )
                        self._data['get'] = data
                # print the progress bar
                self._process_bar(index + 1, self._data['max_count'])
                return content

    # Fetch all segment data
    def _get_all_contents(self):
        loop = asyncio.new_event_loop()
        semaphore = asyncio.Semaphore(self._run_count)
        urls = self._get_urls()
        tasks = []
        # tried a list comprehension here, but it raised an error; a working variant appears in the Supplement above
        for k, v in enumerate(urls):
            task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()

    # Merge video segments
    def _merge_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 1:
                time.sleep(2)
                continue
            # take the lock before iterating, so the data cannot change mid-iteration
            with self._lock:
                for k, v in enumerate(self._data['get']):
                    for i, j in enumerate(self._data['get']):
                        # skip itself
                        if k == i:
                            continue
                        if v['end'] + 1 == j['start']:
                            get_data = self._data['get']
                            get_data[k]['content'] += get_data[i]['content']
                            get_data[k]['end'] = get_data[i]['end']
                            get_data.pop(i)
                            print('\nmerging segment-' + str(v['start']) + ' << ' + str(j['start']))
                            self._data['get'] = get_data
                            break
                    # for/else: break out of both loops
                    else:
                        continue
                    break
            time.sleep(1)

    # Write segment contents out to the file
    def _write_segment_content(self):
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 0:
                time.sleep(0.1)  # avoid busy-waiting while the list is empty
                continue
            content = None
            with self._lock:
                # write only one record per pass, because the list being iterated is modified
                for k, v in enumerate(self._data['get']):
                    if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
                        content = v['content']
                        self._data['write_count'] = v['end']
                        get_list = self._data['get']
                        get_list.pop(k)
                        print('\nwriting segment-' + str(v['start']))
                        self._data['get'] = get_list
                        break
            # do the actual disk write outside the lock, so the lock is not held during I/O
            if content:
                with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
                    f.write(content)
            time.sleep(0.05)

    # Run
    def run(self):
        self._lock = Lock()
        with Manager() as m:
            self._data = m.dict({
                # list of downloaded segment records
                'get': [],
                # total number of segments
                'max_count': len(self._get_urls()),
                # number of segments already written to disk
                'write_count': None,
            })
            tasks = (self._get_all_contents, self._merge_segment_content, self._write_segment_content)
            processes = [Process(target=v) for v in tasks]
            for p in processes:
                p.start()
            for p in processes:
                p.join()


if __name__ == '__main__':
    spider_tools()
