使用python下载简单的m3u8视频
流程
准备一些数据
import re
import requests
@property
def url(self):
    """Read-only accessor for the m3u8 url this object was built with."""
    return self._url
# Fetch and cache the raw m3u8 playlist text.
def _get_m3u8_content(self):
    if self._m3u8_content is None:
        headers = {
            # NOTE(review): replace with your own User-Agent if the site rejects this one
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
        }
        page = requests.get(self._url, headers=headers)
        # bug fix: requests.get never returns None -- check the HTTP status instead
        if page.status_code != 200:
            raise RuntimeError("can't get url's content")
        self._m3u8_content = page.text
    return self._m3u8_content
# Base url of the playlist: everything up to (and including) the last '/'
# before the trailing xxx.m3u8 part. Cached after the first computation.
def _get_head_url(self):
    if self._head_url is None:
        matches = re.findall(r'(.*/).*\.m3u8?', self._url)
        if not matches:
            raise RuntimeError("can't get head url")
        self._head_url = matches[0]
    return self._head_url
# Return (and cache) the list of .ts segment urls found in the playlist,
# prefixing relative entries with the playlist's base url.
def _get_urls(self):
    if self._urls is None:
        segment_urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
        if not segment_urls:
            raise RuntimeError("can't find urls")
        # '^http' also covers 'https', so one test suffices.
        if re.match(r'^http', segment_urls[0]) is None:
            base = self._get_head_url()
            segment_urls = [base + u for u in segment_urls]
        self._urls = segment_urls
    return self._urls
解密
# Detect (and cache) whether the playlist declares AES encryption.
# Assumes AES ECB/CBC (#EXT-X-KEY:METHOD=...); other schemes need changes.
def _get_is_encrypted(self):
    if self._is_encrypted is None:
        # bug fix: re.search instead of re.match -- re.match only anchors at
        # the start of the string, and the #EXT-X-KEY line is almost never
        # the first line of a playlist (that is '#EXTM3U').
        if re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):
            self._is_encrypted = True
        else:
            self._is_encrypted = False
    return self._is_encrypted
# Return (and cache) the #EXT-X-KEY line of the playlist, newline included.
def _get_encrypted_line(self):
    if self._encrypted_line is None:
        matches = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
        if not matches:
            raise RuntimeError("can't get encrypted line")
        self._encrypted_line = matches[0]
    return self._encrypted_line
# Download (and cache) the AES key referenced by the #EXT-X-KEY line.
# Key padding to 16 bytes is not handled here.
def _get_encrypted_key(self):
    if self._encrypted_key is None:
        # NOTE(review): the pattern assumes the key uri ends with 'y'
        # (e.g. key.key); adjust it for other naming schemes.
        found = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        head_url = self._get_head_url()
        if found:
            key = found[0]
            # bug fix: the original passed the list `find` to re.match
            # (TypeError) and used the raw url as a regex pattern; a plain
            # prefix test expresses the intent (absolute vs relative uri).
            if not key.startswith('http'):
                key = head_url + key
        else:
            # No key uri in the playlist -- try the most common convention.
            key = head_url + 'key.key'
        req = requests.get(key)
        if req:
            self._encrypted_key = req.content
        else:
            raise RuntimeError("can't get encrypted key")
    return self._encrypted_key
# Return (and cache) the IV value from the #EXT-X-KEY line.
# NOTE(review): returned as the raw text (e.g. '0x...'); AES-CBC needs 16
# raw bytes, so a conversion may be required downstream -- confirm.
def _get_encrypted_iv(self):
    if self._encrypted_iv is None:
        matches = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
        if not matches:
            raise RuntimeError("can't get encrypted iv")
        self._encrypted_iv = matches[0]
    return self._encrypted_iv
# AES-decrypt *content*: ECB when no iv is supplied, CBC otherwise.
def _get_decrypt_content(self, content, key, iv=None):
    if iv is None:
        cipher = AES.new(key, AES.MODE_ECB)
    else:
        cipher = AES.new(key, AES.MODE_CBC, iv)
    return cipher.decrypt(content)
# Decrypt *content* when the playlist is encrypted; pass it through otherwise.
def _decrypt_content(self, content):
    if self._get_is_encrypted():
        # bug fix: the original called self._encrypted_key() and
        # self._encrypted_iv() -- those are the (None) cache attributes,
        # not the getter methods, so this always raised TypeError.
        key = self._get_encrypted_key()
        iv = self._get_encrypted_iv()
        content = self._get_decrypt_content(content, key, iv)
    return content
多进程异步下载
- 将程序分为三个部分,异步下载段视频、合并段视频、写入到文件
- 使用进程是为了利用多核cpu,每个进程负责一个部分
进程间共用的数据
self._data = m.dict({
# 视频段数据列表
'get': [
# {
# # 开始下标
# 'start': 1,
# # 结束下标
# 'end': 1,
# # 视频段二进制内容
# 'content': b'xxxx',
# }
],
# 视频段总数量
'max_count': len(self._get_urls()),
# 当前写入磁盘文件的数量
'write_count': None,
})
异步下载段视频部分
# GET *segment_url*, recursing to retry on failure.
# Returns the aiohttp response, or None once *count* retries are spent.
async def _get_rep(self, session, segment_url, count):
    if count <= 0:
        return None
    count -= 1
    try:
        async with session.get(segment_url) as rep:
            return rep
    # bug fix: aiohttp raises ClientError subclasses, not RuntimeError,
    # so the original never caught anything it intended to.
    except Exception:
        # bug fix: the recursive retry must be awaited -- the original
        # returned a coroutine object instead of a response.
        return await self._get_rep(session, segment_url, count)
# Redraw a one-line textual progress bar on stdout.
def _process_bar(self, cur, end):
    percent = int((cur / end) * 100)
    print("\r", end='')
    print("download file ({}) {}%:".format(str(cur) + '/' + str(end), percent),
          '▋' * percent,
          end='')
    sys.stdout.flush()
# Download, decrypt and record one segment (requires aiohttp).
# Any failure records an empty segment so the merge/write processes never
# stall waiting for this index.
async def _async_get_segment_content(self, segment_url, semaphore, index):
    def _record(segment_content):
        # Manager.dict values cannot be mutated in place -- fetch, modify
        # and reassign the whole list instead.
        with self._lock:
            data = self._data['get']
            data.append({'start': index, 'end': index, 'content': segment_content})
            self._data['get'] = data

    content = b''  # bug fix: `content` was unbound on the error / falsy-rep path
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            try:
                async with session.get(segment_url) as rep:
                    if rep:
                        content = await rep.read()
                        content = self._decrypt_content(content)
                # bug fix: the original recorded nothing when rep was falsy,
                # which would stall the merge process forever.
                _record(content)
            except (Exception, RuntimeError):
                _record(b'')
    self._process_bar(index + 1, self._data['max_count'])
    return content
# Download every segment concurrently on a private event loop.
def _get_all_contents(self):
    loop = asyncio.new_event_loop()
    # bug fix: make the loop current before creating the Semaphore --
    # pre-3.10 asyncio binds primitives to the *current* loop at creation,
    # and the original's not-current loop is also why list comprehensions
    # over ensure_future(..., loop=loop) "报错" (errored) for the author.
    asyncio.set_event_loop(loop)
    try:
        semaphore = asyncio.Semaphore(self._run_count)
        tasks = [
            loop.create_task(self._async_get_segment_content(url, semaphore, index))
            for index, url in enumerate(self._get_urls())
        ]
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        loop.close()
- 遗憾:
- 当session.get发生错误时没有修改成重连几次
- 无法用列表生成式生成任务
- aiohttp.ClientSession()返回的变量没有复用
合并段视频部分
# Merge loop: repeatedly coalesce adjacent downloaded segments in the shared
# list until the writer has flushed everything to disk.
def _merge_segment_content(self):
    while True:
        if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
            break
        if len(self._data['get']) <= 1:
            time.sleep(2)
            continue
        # Take the lock for the whole scan so the list cannot change mid-walk.
        with self._lock:
            merged = False
            for k, v in enumerate(self._data['get']):
                for i, j in enumerate(self._data['get']):
                    if k == i:
                        continue  # skip self-comparison
                    if v['end'] + 1 == j['start']:
                        items = self._data['get']
                        items[k]['content'] += items[i]['content']
                        items[k]['end'] = items[i]['end']
                        items.pop(i)
                        self._data['get'] = items
                        merged = True
                        break
                if merged:
                    break
        time.sleep(1)
写入到文件部分
# Writer loop: append finished segments to the output file in playlist order.
def _write_segment_content(self):
    while True:
        if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
            break
        if len(self._data['get']) <= 0:
            # bug fix: sleep while idle instead of busy-spinning at 100% cpu
            time.sleep(0.05)
            continue
        content = None
        with self._lock:
            # Pop at most one entry per pass: the next segment in order.
            for k, v in enumerate(self._data['get']):
                first = self._data['write_count'] is None and v['start'] == 0
                following = self._data['write_count'] is not None and self._data['write_count'] + 1 == v['start']
                # bug fix: the original accepted ANY segment as the first
                # write when write_count was None, breaking output order.
                if first or following:
                    content = v['content']
                    self._data['write_count'] = v['end']
                    items = self._data['get']
                    items.pop(k)
                    self._data['get'] = items
                    break
        # Do the (slow) disk write outside the lock.
        if content:
            with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
                f.write(content)
        time.sleep(0.05)
全部代码
- 注意: 关于请求头部分需要修改成自己的
import asyncio
import os.path
import re
import sys
import time
from multiprocessing import Lock, Manager, Process
import aiohttp
import requests
from Crypto.Cipher import AES
class M3u8(object):
    """Simple m3u8 video downloader.

    Fetches the playlist, downloads the .ts segments concurrently
    (AES-decrypting them if needed), merges adjacent segments and appends
    them, in order, to a single output file. Download / merge / write each
    run in their own process.

    Parameters
    ----------
    m3u8_url : str
        Url of the m3u8 playlist.
    run_count : int
        Maximum number of concurrent segment requests.
    save_file_dir : str, optional
        Directory the video is saved into (default ``./``).
    save_file_name : str, optional
        File name of the merged video (default ``mv.mp4``).
    """

    def __init__(self, m3u8_url, run_count, save_file_dir=None, save_file_name=None) -> None:
        # bug fix: apply the defaults BEFORE storing them -- the original
        # assigned self._save_file_dir/_save_file_name first and defaulted
        # only the local variables, leaving the attributes as None.
        if save_file_dir is None:
            save_file_dir = './'
        if save_file_name is None:
            save_file_name = 'mv.mp4'
        self._m3u8_url = m3u8_url
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        self._run_count = run_count
        # lazily-computed caches
        self._m3u8_content = None
        self._head_url = None
        self._is_encrypted = None
        self._encrypted_line = None
        self._encrypted_key = None
        self._encrypted_iv = None
        self._urls = None
        if not os.path.exists(save_file_dir):
            os.mkdir(save_file_dir)
        # NOTE: an existing file with the same name is deleted.
        if os.path.exists(os.path.join(save_file_dir, save_file_name)):
            os.remove(os.path.join(save_file_dir, save_file_name))

    def _get_m3u8_content(self):
        """Fetch and cache the raw playlist text."""
        if self._m3u8_content is None:
            headers = {
                # bug fix: the original had an unterminated string literal here.
                # NOTE(review): replace with your own UA if the site blocks it.
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
            }
            page = requests.get(self._m3u8_url, headers=headers)
            # bug fix: requests.get never returns None; check the status code
            if page.status_code != 200:
                raise RuntimeError("can't get url's content")
            self._m3u8_content = page.text
        return self._m3u8_content

    def _get_head_url(self):
        """Return (cached) the playlist url with the trailing xxx.m3u8 removed."""
        if self._head_url is None:
            find = re.findall(r'(.*/).*\.m3u8?', self._m3u8_url)
            if find:
                self._head_url = find[0]
            else:
                raise RuntimeError("can't get head url")
        return self._head_url

    def _get_is_encrypted(self):
        """Whether the playlist declares AES encryption (ECB/CBC assumed)."""
        if self._is_encrypted is None:
            # bug fix: re.search instead of re.match -- the #EXT-X-KEY line
            # is almost never the first line of the playlist.
            self._is_encrypted = re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()) is not None
        return self._is_encrypted

    def _get_encrypted_line(self):
        """Return (cached) the #EXT-X-KEY line, newline included."""
        if self._encrypted_line is None:
            find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
            if find:
                self._encrypted_line = find[0]
            else:
                raise RuntimeError("can't get encrypted line")
        return self._encrypted_line

    def _get_encrypted_key(self):
        """Download (and cache) the AES key referenced by the key line."""
        if self._encrypted_key is None:
            # NOTE(review): the pattern assumes the key uri ends with 'y'.
            find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            head_url = self._get_head_url()
            if find:
                key = find[0]
                # bug fix: the original passed the list `find` to re.match
                # (TypeError); a prefix test expresses the actual intent.
                if not key.startswith('http'):
                    key = head_url + key
            else:
                # Fall back to the most common key location.
                key = head_url + 'key.key'
            req = requests.get(key)
            if req:
                self._encrypted_key = req.content
            else:
                raise RuntimeError("can't get encrypted key")
        return self._encrypted_key

    def _get_encrypted_iv(self):
        """Return (cached) the IV text from the key line.

        NOTE(review): returned as text (e.g. '0x...'); AES-CBC wants 16 raw
        bytes, so conversion may be needed -- confirm against the stream.
        """
        if self._encrypted_iv is None:
            find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            if find:
                self._encrypted_iv = find[0]
            else:
                raise RuntimeError("can't get encrypted iv")
        return self._encrypted_iv

    def _get_decrypt_content(self, content, key, iv=None):
        """AES-decrypt *content*: ECB without an iv, CBC with one."""
        if iv is None:
            aes = AES.new(key, AES.MODE_ECB)
        else:
            aes = AES.new(key, AES.MODE_CBC, iv)
        return aes.decrypt(content)

    def _decrypt_content(self, content):
        """Decrypt *content* when the playlist is encrypted, else pass through."""
        if self._get_is_encrypted():
            # bug fix: the original called self._encrypted_key()/._encrypted_iv(),
            # i.e. the (None) cache attributes, not the getter methods.
            key = self._get_encrypted_key()
            iv = self._get_encrypted_iv()
            content = self._get_decrypt_content(content, key, iv)
        return content

    def _get_urls(self):
        """Return (cached) the absolute urls of all .ts segments."""
        if self._urls is None:
            urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
            if not urls:
                raise RuntimeError("can't find urls")
            # Prepend the base url when the segments are relative paths.
            if not re.match(r'^http', urls[0]):
                head_url = self._get_head_url()
                urls = [head_url + u for u in urls]
            self._urls = urls
        return self._urls

    def _process_bar(self, cur, end):
        """Redraw a one-line download progress bar on stdout."""
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    async def _async_get_segment_content(self, segment_url, semaphore, index):
        """Download, decrypt and record one segment (requires aiohttp).

        Any failure records an empty segment so merge/write never stall.
        """
        def _record(segment_content):
            # Manager.dict values cannot be mutated in place; reassign the list.
            with self._lock:
                data = self._data['get']
                data.append({'start': index, 'end': index, 'content': segment_content})
                self._data['get'] = data

        content = b''  # bug fix: `content` was unbound on the error path
        async with semaphore:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(segment_url) as rep:
                        if rep:
                            content = await rep.read()
                            content = self._decrypt_content(content)
                    _record(content)
                except (Exception, RuntimeError):
                    _record(b'')
        self._process_bar(index + 1, self._data['max_count'])
        return content

    def _get_all_contents(self):
        """Download all segments concurrently on a private event loop."""
        loop = asyncio.new_event_loop()
        # bug fix: make the loop current so the Semaphore binds to it
        # (pre-3.10 asyncio binds primitives at creation time).
        asyncio.set_event_loop(loop)
        try:
            semaphore = asyncio.Semaphore(self._run_count)
            tasks = [
                loop.create_task(self._async_get_segment_content(url, semaphore, index))
                for index, url in enumerate(self._get_urls())
            ]
            loop.run_until_complete(asyncio.wait(tasks))
        finally:
            loop.close()

    def _merge_segment_content(self):
        """Coalesce adjacent downloaded segments until everything is written."""
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 1:
                time.sleep(2)
                continue
            # Hold the lock for the whole scan so the list can't change mid-walk.
            with self._lock:
                merged = False
                for k, v in enumerate(self._data['get']):
                    for i, j in enumerate(self._data['get']):
                        if k == i:
                            continue  # skip self-comparison
                        if v['end'] + 1 == j['start']:
                            get_data = self._data['get']
                            get_data[k]['content'] += get_data[i]['content']
                            get_data[k]['end'] = get_data[i]['end']
                            get_data.pop(i)
                            print('\n合并视频-' + str(v['start']) + '<<' + str(j['start']))
                            self._data['get'] = get_data
                            merged = True
                            break
                    if merged:
                        break
            time.sleep(1)

    def _write_segment_content(self):
        """Append segments to the output file in playlist order."""
        out_path = os.path.join(self._save_file_dir, self._save_file_name)
        while True:
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 0:
                # bug fix: idle-sleep instead of busy-spinning at 100% cpu
                time.sleep(0.05)
                continue
            content = None
            with self._lock:
                # Write at most one segment per pass -- the next one in order.
                for k, v in enumerate(self._data['get']):
                    first = self._data['write_count'] is None and v['start'] == 0
                    following = self._data['write_count'] is not None and self._data['write_count'] + 1 == v['start']
                    # bug fix: the original accepted ANY segment as the first
                    # write, breaking the output order.
                    if first or following:
                        content = v['content']
                        self._data['write_count'] = v['end']
                        get_list = self._data['get']
                        get_list.pop(k)
                        print('\n写入视频-' + str(v['start']))
                        self._data['get'] = get_list
                        break
            # Do the disk write outside the lock.
            if content:
                with open(out_path, 'ab') as f:
                    f.write(content)
            time.sleep(0.05)

    def run(self):
        """Spawn the download / merge / write processes and wait for them."""
        self._lock = Lock()
        with Manager() as m:
            self._data = m.dict({
                'get': [],                           # downloaded segment records
                'max_count': len(self._get_urls()),  # total number of segments
                'write_count': None,                 # highest index written so far
            })
            targets = (self._get_all_contents, self._merge_segment_content, self._write_segment_content)
            processes = [Process(target=t) for t in targets]
            for p in processes:
                p.start()
            for p in processes:
                p.join()
if __name__ == '__main__':
    # Example usage -- replace with a real playlist url before running.
    url = r'xxx/index.m3u8'
    m3u8 = M3u8(url, 3, './mv', 'mv.mp4')
    m3u8.run()
- 遗憾:
- 无法使用进程池
- 无法直接修改进程共享变量的最深层数据
补充
同步请求
# Synchronously download one segment and return its (decrypted) bytes.
def _download_single_mv(self, url):
    req = requests.get(url)
    # A Response is falsy for 4xx/5xx status codes.
    if req:
        content = req.content
    else:
        # bug fix: typo 'sigle' -> 'single' in the error message
        raise RuntimeError(f"can't download single mv, url : {url}")
    # Decrypt if the playlist is encrypted (no-op otherwise).
    content = self._decrypt_content(content)
    return content
# Download every ts segment and write each one to its own '0<k>.ts' file.
def _download_all_ts(self, save_file_dir):
    for index, segment_url in enumerate(self._get_urls()):
        segment = self._download_single_mv(segment_url)
        with open(save_file_dir + '/0' + str(index) + '.ts', 'wb') as f:
            f.write(segment)
# Download every ts segment and append them straight into one output file.
def _download_all_ts_and_merge(self, save_file_dir, save_file_name):
    with open(save_file_dir + '/' + save_file_name, 'ab') as f:
        for segment_url in self._get_urls():
            f.write(self._download_single_mv(segment_url))
异步请求
# Write one downloaded segment to its own '0<index>.ts' file on disk.
def _async_write_file(self, content, index):
    segment_path = self._save_file_dir + '/0' + str(index) + '.ts'
    with open(segment_path, 'wb') as f:
        f.write(content)
# Async fetch of one segment (requires aiohttp); decrypts it and writes it
# to its own ts file, then returns the decrypted bytes.
async def _async_get(self, url, semaphore, index):
    async with semaphore:
        async with aiohttp.ClientSession() as session:
            async with session.get(url) as rep:
                raw = await rep.read()
                segment = self._decrypt_content(raw)
                self._async_write_file(segment, index)
                return segment
# Drive the async downloads: one task per segment, at most *count* in flight.
def _async_run(self, count):
    loop = asyncio.new_event_loop()
    # bug fix: make the loop current so the Semaphore binds to it
    # (pre-3.10 asyncio binds primitives to the current loop at creation).
    asyncio.set_event_loop(loop)
    try:
        semaphore = asyncio.Semaphore(count)
        tasks = [
            loop.create_task(self._async_get(url, semaphore, index))
            for index, url in enumerate(self._get_urls())
        ]
        loop.run_until_complete(asyncio.wait(tasks))
    finally:
        # bug fix: the original never closed the loop it created
        loop.close()
合并
- 合并的4种方式
# Remove every .ts file directly inside *del_files_dir* (used after merging).
def _del_all_ts_files(self, del_files_dir):
    for name in os.listdir(del_files_dir):
        path = del_files_dir + '/' + name
        if os.path.isfile(path) and os.path.splitext(path)[1] == '.ts':
            os.remove(path)
- zipfile
# Collect the downloaded .ts files into one (stored, uncompressed) zip.
def _zipfile_merge(self, save_file_dir, save_file_name, is_del=True):
    # bug fix: the original zipped EVERY file in the directory (including
    # non-.ts files) under 'save_file_dir/...' arcnames and in arbitrary
    # listdir order; filter to .ts, sort for segment order, flatten arcnames.
    files = sorted(x for x in os.listdir(save_file_dir) if os.path.splitext(x)[1] == '.ts')
    with zipfile.ZipFile(os.path.join(save_file_dir, save_file_name), 'a') as z:
        for file in files:
            z.write(os.path.join(save_file_dir, file), arcname=file)
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
- copy 命令 合并文件
# Merge the .ts files with the Windows `copy /b` shell command.
def _copy_command_merge(self, save_file_dir, save_file_name, is_del=True):
    cur_dir = os.path.abspath('.')
    os.chdir(save_file_dir)
    try:
        # NOTE(review): copy /b merges in directory order, which may not
        # match the intended segment order.
        os.system("copy /b *.ts new.tmp")
        os.rename("new.tmp", save_file_name)
    finally:
        # bug fix: the original restored the working directory only when
        # is_del was True, leaving the process chdir'ed otherwise.
        os.chdir(cur_dir)
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
- ffmpeg 合并文件
# Merge the downloaded segments with ffmpeg, via a rewritten local playlist.
def _ffmpeg_merge(self, save_file_dir, save_file_name, is_del=True):
    m3u8_content = self._get_m3u8_content()
    # Drop the encryption line: segments are presumably stored decrypted
    # by the downloader -- TODO confirm.
    m3u8_content = re.sub(r'(#EXT-X-KEY:METHOD.*\n)', '', m3u8_content)
    # Point every remote segment url at the local '0<k>.ts' file.
    urls = re.findall(r'(h.*\.ts)', m3u8_content)
    for k, v in enumerate(urls):
        # bug fix: re.sub needs a *string* replacement (k was an int, which
        # raises TypeError), the url must be escaped to be a safe pattern,
        # and the replacement must name the actual local ts file.
        m3u8_content = re.sub(re.escape(v), '0' + str(k) + '.ts', m3u8_content, count=1)
    cur_dir = os.path.abspath('.')
    os.chdir(save_file_dir)
    try:
        # bug fix: the original opened the file in 'wb' and wrote a str
        with open('index.m3u8', 'w', encoding='utf-8') as f:
            f.write(m3u8_content)
        os.system("ffmpeg -i index.m3u8 -c copy " + save_file_name)
    finally:
        os.chdir(cur_dir)
    if is_del is True:
        self._del_all_ts_files(save_file_dir)
- 下载段视频到内存后直接追加到磁盘中的视频文件,不使用临时.ts文件
后记
把整个视频下载下来然后再进行解密比较好
后来又写了一段,时间为 2022-12-14 17:18,写到网易云搜索字符加密然后发送请求这一步不想写了,就放弃了。现在又全都忘得差不多了,只留下了不知道什么样的代码,等以后兴趣再来了再去弄吧。
#!/usr/bin/env python3
# 仅用于学习参考,请体谅免费资源提供网站,不要发送大量请求
# 需要下载的依赖包
# python -m pip install fake-useragent lxml requests aiohttp
import argparse
import asyncio
import json
import os
import random
import re
import sys
import time
from enum import Enum
from functools import reduce
from pprint import pprint
from urllib.parse import quote, urlparse, urlsplit
from multiprocessing import Lock, Manager, Process
from Crypto.Cipher import AES
import aiohttp
import requests
from requests.exceptions import ReadTimeout
from fake_useragent import UserAgent, UserAgentError
from lxml import etree
class spider_tools(object):
    """Command-line entry point: parse argv and dispatch to the chosen spider."""

    def __init__(self):
        option = Enum('spider_option', ('bayizww', 'neteasemusic', 'hanTvn'))
        hint = {option.bayizww: '八一中文网 小说', option.neteasemusic: '网易云音乐 音乐', option.hanTvn: '韩剧Tvn 视频'}
        option_str = ''.join('\n [' + member.name + '] ' + hint[member] + '、' for member in option)
        parse = argparse.ArgumentParser()
        parse.add_argument('spider_option', help='选择一个下载选项:' + option_str, type=str)
        parse.add_argument('search_str', help='搜索的内容', type=str)
        parse.add_argument('-d', '--save_dir', type=str, help='存储的位置', default='./')
        parse.add_argument('-c', '--link_count', type=int, help='同时连接的数量', default=1)
        # Print the menu and bail out when no option was given on argv.
        if len(sys.argv) < 2:
            print('选择一个下载选项:' + option_str)
            return
        spider_option = sys.argv[1]
        if spider_option == option.bayizww.name:
            spider_81txt(parse.parse_args(sys.argv[1:]))
        elif spider_option == option.neteasemusic.name:
            # Netease flow was never finished; args are parsed but unused.
            parse.parse_args(sys.argv[1:])
        elif spider_option == option.hanTvn.name:
            spider_hanTvn_mv(parse.parse_args(sys.argv[1:]))
# Module-level fake-useragent pool shared by spider_common._headers().
ua = UserAgent()
class spider_common(object):
    """Helpers shared by the concrete spiders: request headers, a progress
    bar, async/sync file downloads and interactive index selection."""

    @staticmethod
    def _headers():
        """Return request headers with a random browser UA (fixed fallback)."""
        try:
            return {'user-agent': ua[random.choice(ua.browsers)]}
        except UserAgentError:
            return {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/107.0.0.0 Safari/537.36 Edg/107.0.1418.62'
            }

    @staticmethod
    def _process_bar(cur, end):
        """Redraw a one-line download progress bar on stdout."""
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()

    async def _async_download_file(self, url, link_count, func, cur, end, headers=None):
        """Fetch *url* and hand its bytes to *func*; ask the user on failure.

        bug fixes: the headers default is now resolved per call (the original
        evaluated ``_headers()`` once at class-definition time, which also
        fails outright before Python 3.10 where staticmethod objects are not
        callable); the recursive retry is awaited (the original returned a
        coroutine object); retrying is option 0, matching the printed menu.
        """
        if headers is None:
            headers = self._headers()
        async with link_count:
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(url, headers=headers) as rep:
                        content = await rep.read()
                        if func:
                            self._process_bar(cur, end)
                            return func(content)
                        else:
                            raise Exception('不存在执行方法func')
                except (Exception, RuntimeError) as e:
                    print('\n获取' + url + '内容失败,请选择处理方式:')
                    print(e)
                    print('0: 再次重新获取\n1:丢失此内容')
                    get_option = self._input_select_index('请选择处理方式', 0, 2)
                    if get_option == 0:
                        return await self._async_download_file(url, link_count, func, cur, end)
                    else:
                        return self._download_error_return

    # Strip the episode suffix: '虽然是精神病但没关系16.mp4' -> '虽然是精神病但没关系'
    _del_url_episode_reg = re.compile(r'第?\w+\.mp4')

    def _sync_download_mv(self, url, save_dir, chunk_size):
        """Stream a video to disk with retries, a progress bar and a size cap."""
        url_base_name = os.path.basename(url)
        url = url.replace(url_base_name, quote(url_base_name))
        response = None  # bug fix: was unbound when every attempt timed out
        tmp_count = 1
        sleep_time = 3.5
        while tmp_count < 4:
            try:
                response = requests.get(url=url, stream=True, timeout=1, headers=self._headers())
                if response.status_code != 200:
                    print('获取下载视频响应失败,错误码:' + str(response.status_code))
                    raise ReadTimeout
                break
            except ReadTimeout:
                print('获取下载视频响应超时,正在尝试重连')
                time.sleep(sleep_time)
                tmp_count += 1
        # bug fix: guard BEFORE touching response.headers (the original
        # printed the sizes first and crashed when every retry failed).
        if not response or response.status_code != 200:
            print('无法获取下载视频响应,正在退出')
            if response:
                response.close()
            return
        print('总大小:' + str(int(response.headers['Content-Length']) / 1024 / 1024) + 'M')
        print('每次读取大小: ' + str(chunk_size / 1024 / 1024) + 'M')
        save_dir = os.path.join(save_dir, self._del_url_episode_reg.sub('', url_base_name))
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        save_path = os.path.join(save_dir, url_base_name)
        if os.path.exists(save_path):
            print('文件已存在:' + save_path)
            response.close()
            return
        try:
            with open(save_path, mode='wb') as f:
                print('开始下载:' + url_base_name)
                for chunk in response.iter_content(chunk_size):
                    if chunk:
                        self._process_bar(os.path.getsize(save_path), int(response.headers['Content-Length']))
                        f.write(chunk)
                    # Stop after 10M -- presumably a debugging cap; TODO confirm.
                    if os.path.getsize(save_path) > 10 * 1024 * 1024:
                        return
        finally:
            # bug fix: the original leaked the connection on early returns
            response.close()

    @staticmethod
    def _get_xpath_html(url, headers=None, delay_time=None, timeout=1):
        """GET *url* (with up to 3 retries) and return an lxml tree for xpath."""
        if headers is None:
            headers = spider_common._headers()
        print('timeout:' + str(timeout))
        if delay_time:
            time.sleep(delay_time)
        tmp_count = 1
        sleep_time = 3.5
        while tmp_count < 4:
            try:
                with requests.get(url, headers=headers, timeout=timeout) as response:
                    if response.status_code == 200:
                        return etree.HTML(response.content, parser=etree.HTMLParser(encoding='utf-8'))
                    print('访问失败,错误码为:' + str(response.status_code))
                    raise ReadTimeout
            except ReadTimeout:
                print('访问超时,正在尝试第' + str(tmp_count) + '次重连' + ',等待' + str(sleep_time) + '后开始发送请求')
                time.sleep(sleep_time)
                tmp_count += 1
        raise Exception('尝试重连' + str(tmp_count - 1) + '次后,也无法获取到响应')

    @staticmethod
    def _input_select_index(show_text, start, end):
        """Prompt until the user enters an int in [start, end) or 'exit'."""
        while True:
            try:
                input_str = input(show_text + ':')
                if input_str == 'exit' or input_str == '^C':
                    return 'exit'
                get_index = int(input_str)
                if get_index not in range(start, end):
                    raise ValueError('请输入在{' + str(start) + ',' + str(end - 1) + '}范围内的值')
                return get_index
            except (KeyboardInterrupt, ValueError) as e:
                print('\n错误:请输入正确的数字')
                print(e)
class spider_81txt(spider_common):
    """Search www.81zw.com for a novel and save it as a single txt file."""

    # Returned by _async_download_file when a chapter download finally fails.
    _download_error_return = {"chapter_title": '', "book_content": ''}

    # Site watermarks / junk stripped from chapter text (e.g. 'm.81ZW.ćőm',
    # '⑧①.ZW.ćőm', 'www.八壹.zw.ćőm' and various reader-app banners).
    _book_content_regexp_compiled_str = re.compile(r'(([8|八|⑧][1|(一|壹)|①])中文[网|網])|'
                                                   r'(([m|(www)]\.)*([8|八|⑧][1|(一|壹)|①])([z|Z][w|W])\.([ć|c]|[ő|o]|m))|'
                                                   r'(网页版章节内容慢,请下载爱阅小说app阅读最新内容(.|\n|\r)*)|'
                                                   r'(本章未完,点击\[下一页\]继续阅读-->>[\n|\r].*)|'
                                                   r'(.*?推荐一本书[.|\n|\r]*)|'
                                                   r'(\?此章节正在\?ww\.om努力更新ing,请稍后刷新访问[.|\n|\r]*)'
                                                   )

    def __init__(self, parse_args):
        self._get_txt_book(parse_args.search_str, parse_args.save_dir, parse_args.link_count)

    def _search_books_by_name(self, search_str):
        """Search the site by name; return a list of book-info dicts."""
        if search_str is None:
            raise Exception("search_str is None")
        # Search result paging is not supported.
        url = r'https://www.81zw.com/search.php?keyword='
        search_url = url + quote(search_str)
        search_result_books = self._get_xpath_html(search_url).xpath(
            '//div[@class="result-list gameblock-result-list"]/div')
        books_info = []
        for node in search_result_books:
            info_div = node.xpath('.//div[@class="result-game-item-detail"]')[0]
            book_title = info_div.xpath('.//h3/a/text()')[0].strip()
            book_url = info_div.xpath('.//h3/a/@href')[0]
            # Relative link -> absolute, based on the site root.
            book_url = re.sub('/search.*', '', url) + book_url
            synopsis = info_div.xpath('.//p[@class="result-game-item-desc"]/text()')[0]
            other_info = info_div.xpath('.//div[@class="result-game-item-info"]/p')
            book_author = other_info[0].xpath('.//span')[1].xpath('.//text()')[0]
            book_type = other_info[1].xpath('.//span')[1].xpath('.//text()')[0]
            book_update_time = other_info[2].xpath('.//span')[1].xpath('.//text()')[0]
            book_latest_chapters = other_info[3].xpath('.//a/text()')[0]
            books_info.append({
                "title": book_title,
                "url": book_url,
                "synopsis": synopsis,
                "author": book_author,
                "type": book_type,
                "update_time": book_update_time,
                "latest_chapters": book_latest_chapters,
            })
        return books_info

    def _get_book_info(self, html):
        """Return the book title plus its relative chapter links."""
        title = html.xpath("//div[@id='info']/h1/text()")[0].strip()
        # '/book/61100/256885.html' -> '256885.html': keep only the last part.
        chapters = [re.sub(r'/\w+/\d+/', '', href) for href in html.xpath("//div[@id='list']/dl/dd/a/@href")]
        return {"title": title, "chapters": chapters}

    def _get_chapter_info(self, html):
        """Extract one chapter's title and cleaned text from its page."""
        chapter_title = html.xpath("//div[@class='bookname']/h1/text()")[0].strip()
        # Join the text nodes; full-width double spaces mark paragraph breaks.
        book_content = ''.join(
            x.replace('\n', '').replace('\u3000\u3000', '\n')
            for x in html.xpath('//div[@id="content"]/text()')
        )
        # bug fix: re.sub returns the new string -- the original discarded
        # the result, leaving all site watermarks in the saved text.
        book_content = self._book_content_regexp_compiled_str.sub('', book_content)
        return {"chapter_title": chapter_title, "book_content": book_content}

    def _get_txt_book(self, search_book_name, save_dir='./', semaphore=1):
        """Search, let the user pick a book, download all chapters, write txt.

        Parameters
        ----------
        search_book_name : str  search text
        save_dir : str          output directory
        semaphore : int         number of concurrent connections
        """
        books = self._search_books_by_name(search_book_name)
        for k, v in enumerate(books):
            book_info = '名称:' + v['title'] + ' 作者:' + v["author"] + ' 最新章节:' + v["latest_chapters"] + ' 最近更新时间:' + v[
                "update_time"] + ' 类型:' + v["type"] + '\n 简介:' + v["synopsis"]
            book_info = '[' + str(k) + ']' + '<\n ' + book_info + '\n>' + '\n'
            print(book_info)
        select_index = self._input_select_index('输入书籍下标', 0, len(books))
        if select_index == 'exit':
            print('中断程序')
            return
        select_book_url = books[select_index]["url"]
        book_info = self._get_book_info(self._get_xpath_html(select_book_url))
        contents = {'title': book_info['title']}
        loop = asyncio.new_event_loop()
        # Make the loop current so the Semaphore binds to it (pre-3.10 asyncio).
        asyncio.set_event_loop(loop)
        # bug fix: the original created asyncio.Semaphore() with no argument,
        # silently ignoring the requested connection count.
        sem = asyncio.Semaphore(semaphore)
        chapters_count = len(book_info['chapters'])
        tasks = [
            loop.create_task(self._async_download_file(
                select_book_url + chapter,
                sem,
                lambda content: self._get_chapter_info(etree.HTML(content)),
                k,
                chapters_count))
            for k, chapter in enumerate(book_info['chapters'])
        ]
        result = loop.run_until_complete(asyncio.gather(*tasks))
        loop.close()
        contents['chapters_content'] = result
        print('\n下载完毕 开始写入文件')
        if not os.path.exists(save_dir):
            os.mkdir(save_dir)
        with open(os.path.join(save_dir, (contents["title"] + '.txt')), 'w', encoding='utf-8') as f:
            # Book title, roughly centred by tabs.
            f.writelines('\t\t\t\t\t\t\t' + contents["title"] + '\n\n')
            for v in contents['chapters_content']:
                f.writelines('\n' + v['chapter_title'])
                f.write(v['book_content'] + '\n\n')
        print('下载<' + contents["title"] + '>成功')
class spider_hanTvn_mv(spider_common):
    """Search www.hantvn.com and download a show, either via direct download
    links or by resolving each episode's m3u8 url."""

    _base_url = r'https://www.hantvn.com'
    # Pulls the json blob out of the player's <script> tag.
    _extract_json_data_reg = re.compile(r'\{.*\}')
    _m3u8_url_reg = re.compile(r'.*\.m3u8')

    def __init__(self, parse_args):
        print('如果卡住或超时,请稍等片刻,搜索有3秒间隔限制' + ',将会自动进行3次重连')
        search_str = parse_args.search_str
        save_dir = parse_args.save_dir
        semaphore = parse_args.link_count
        search_results = self._search(search_str)
        for k, v in enumerate(search_results):
            mv_info = '名称:' + v['title'] + ' 导演:' + v['director'] + ' 主演:' + v['protagonist'] + ' 分类:' + v[
                'kind'] + ' 地区:' + v['area'] + ' 年份:' + v['year'] + ' \n 简介:' + v['details']
            mv_info = '[' + str(k) + ']' + '<\n ' + mv_info + '\n>' + '\n'
            print(mv_info)
        select_index = self._input_select_index('请输入影片下标', 0, len(search_results))
        if select_index == 'exit':
            print('中断程序')
            return
        select_mv_url = self._base_url + search_results[select_index]["link"]
        print('\033[34m影片主页:\033[0m' + select_mv_url)
        link_list = self._download_links(select_mv_url)
        if link_list:
            # Direct (thunder-style) download links are available.
            print('\033[31m不建议使用我这玩意下载,用迅雷啥的比这快多了,影片主页内还提供全部下载 以下是每集链接\033[0m')
            print(''.join(item['episode_name'] + ':' + item['link'] + '\n' for item in link_list))
            begin, over = self._select_episode(link_list)
            if begin == 'exit':
                print('中断程序')
                return
            select_list = link_list[begin:over]
            chunk_size = 5 * 1024 * 1024  # read 5M at a time
            for item in select_list:
                self._sync_download_mv(item['link'], save_dir, chunk_size)
        else:
            print('转用m3u8')
            select_xpath_html = self._get_xpath_html(select_mv_url, delay_time=0.5)
            mv_name = select_xpath_html.xpath(
                '/html/body/div[1]/div/div/div/div/div/div[2]/h1[@class="title"]/text()')[0]
            save_dir = os.path.join(save_dir, mv_name)
            episode_links = self._get_select_provider(select_xpath_html)
            if episode_links == 'exit':
                print('中断程序')
                return
            if episode_links:
                begin, over = self._select_episode(episode_links)
                if begin == 'exit':
                    print('中断程序')
                    return
                select_list = episode_links[begin:over]
                print('可以通过m3u8解析网页在线观看,例如:\n无尽播放器 https://jx.wujinkk.com/dplayer/?url= m3u8地址')
                print('\033[31m不建议使用我这玩意下载,用idm啥的比这快多了,以下是每集链接\033[0m')
                print('以下访问需要很长时间,请耐心等待')
                for item in select_list:
                    item['m3u8_url'] = self._get_m3u8_url(item['href'])
                print(''.join(item['episode_name'] + ':' + item['m3u8_url'] + '\n' for item in select_list))
                for item in select_list:
                    spider_m3u8(item['m3u8_url'], semaphore, save_dir, mv_name + item['episode_name'] + '.mp4')
            else:
                print('此源内无视频链接')
                return

    def _search(self, search_str):
        """Search the site; return a list of dicts describing each result."""
        search_str = quote(search_str)
        url = self._base_url + r'/search.html?wd=%s&submit=' % search_str
        xpath_result_list = self._get_xpath_html(url).xpath(
            '/html/body/div[1]/div/div[1]/div/div/div[2]/ul/li/div[@class="detail"]')
        result_list = []
        if xpath_result_list:
            for node in xpath_result_list:
                info_list = node.xpath('.//p[3]/text()')
                result_list.append({
                    'title': node.xpath('string(.//h4/a)'),
                    'link': node.xpath('.//h4/a/@href')[0],
                    'director': node.xpath('.//p[1]/text()')[0],
                    'protagonist': node.xpath('.//p[2]/text()')[0],
                    'kind': info_list[0],
                    'area': info_list[1],
                    'year': info_list[2],
                    'details': node.xpath('.//p[4]/text()')[0],
                })
        return result_list

    def _download_links(self, url):
        """Return [{'episode_name', 'link'}] direct download links, if any."""
        xpath_link_list = self._get_xpath_html(url, delay_time=0.5).xpath(
            '/html/body/div[2]/div/div[contains(string(), "迅雷下载")]/div/div[2]/ul[1]/li')
        link_list = []
        if xpath_link_list:
            for node in xpath_link_list:
                link_list.append({
                    'episode_name': node.xpath('.//span[1]/a/text()')[0],
                    'link': node.xpath('.//span[2]/a[3]/@data-text')[0],
                })
        return link_list

    def _get_select_provider(self, html):
        """Let the user pick a video source; return its episode links."""
        xpath_provider_link_lists = html.xpath('/html/body/div[2]/div/div[2]/div/div[1]/div/ul/li/a')
        provider_link_infos = []
        if xpath_provider_link_lists:
            for node in xpath_provider_link_lists:
                provider_link_infos.append({
                    'href': node.xpath('.//@href')[0].replace('#', ''),
                    'text': node.xpath('.//text()')[0],
                })
        provider_links = []
        if provider_link_infos:
            print('选择一个源:')
            provider_link_str = ''.join(' [' + str(k) + '] ' + v['text'] for k, v in enumerate(provider_link_infos))
            print(provider_link_str)
            select_index = self._input_select_index('请输入视频源的下标', 0, len(provider_link_infos))
            if select_index == 'exit':
                return 'exit'
            select_provider = provider_link_infos[select_index]
            a_links = html.xpath(
                '/html/body/div[2]/div/div[2]/div/div[2]/div[@id="' + select_provider['href'] + '"]/ul/li/a')
            if a_links:
                for node in a_links:
                    provider_links.append({
                        'href': self._base_url + node.xpath('.//@href')[0],
                        'episode_name': node.xpath('.//text()')[0],
                    })
        return provider_links

    def _select_episode(self, episode_list):
        """Ask for start/end episodes; return bounds for ``list[begin:over]``.

        bug fix: the original returned the chosen end index itself, so
        ``list[begin:over]`` excluded the selected last episode in every
        case except when the very last episode was picked (which it special-
        cased with None). The returned end is now always exclusive-of-one-
        past-the-selection, making the range inclusive of the user's choice.
        """
        max_num = len(episode_list)
        print('请选择开始集数和结束集数{0-' + str(max_num - 1) + '}')
        begin = self._input_select_index('请输入开始集数', 0, max_num)
        if begin == 'exit':
            return 'exit', 'exit'
        over = self._input_select_index('请输入结束集数', begin, max_num)
        if over == 'exit':
            return 'exit', 'exit'
        return begin, over + 1

    def _get_m3u8_url(self, href):
        """Resolve an episode page to its m3u8 url via the player json blob.

        The iframe player (/static/player/xx.js) holds the real link and
        /static/js/playerconfig.js has per-source partial addresses; here we
        only parse the json inside the first <script> above the video and
        check it contains a .m3u8 url.
        """
        json_code = self._get_xpath_html(href, delay_time=0.5, timeout=20) \
            .xpath('/html/body/div[1]/div/div/div/div[1]/div/div[2]/script[1]/text()')[0]
        if json_code:
            json_data = self._extract_json_data_reg.findall(json_code)[0]
            m3u8_url = json.loads(json_data)['url']
            if not self._m3u8_url_reg.match(m3u8_url):
                print('无法解析获得m3u8 url')
                return ''
            return m3u8_url
        print('无法解析获得m3u8 url')
        return ''
class spider_m3u8(spider_common):
    """
    Download a simple m3u8 video, optionally AES (ECB/CBC) encrypted.

    Segments are fetched concurrently with aiohttp, merged in order, and
    appended to a single output file by three cooperating processes.

    Parameters
    ----------
    m3u8_url : str
        URL of the .m3u8 playlist.
    run_count : int
        Maximum number of simultaneous segment requests.
    save_file_dir : str, optional
        Directory the merged file is written to (default './').
    save_file_name : str, optional
        Name of the merged output file (default 'mv.mp4').
    """
    def __init__(self, m3u8_url, run_count, save_file_dir=None, save_file_name=None) -> None:
        # BUGFIX: the original applied the defaults to the *local* variables
        # after already storing them, leaving self._save_file_dir and
        # self._save_file_name as None and crashing later in os.path.join().
        if save_file_dir is None:
            save_file_dir = './'
        if save_file_name is None:
            save_file_name = 'mv.mp4'
        self._m3u8_url = m3u8_url
        self._save_file_dir = save_file_dir
        self._save_file_name = save_file_name
        self._run_count = run_count
        # lazily-computed caches (filled on first access)
        self._m3u8_content = None
        self._head_url = None
        self._is_encrypted = None
        self._encrypted_line = None
        self._encrypted_key = None
        self._encrypted_iv = None
        self._urls = None
        if not os.path.exists(save_file_dir):
            os.mkdir(save_file_dir)
        if os.path.exists(os.path.join(save_file_dir, save_file_name)):
            print('该文件已存在')
            # os.remove(os.path.join(save_file_dir, save_file_name))
        else:
            self.run()
    # Fetch and cache the raw m3u8 playlist text.
    def _get_m3u8_content(self):
        if self._m3u8_content is None:
            # NOTE: change the user-agent to your own if the server rejects it
            headers = {
                'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'
            }
            page = requests.get(self._m3u8_url, headers=headers)
            # BUGFIX: requests.get never returns None; a Response is falsy on
            # 4xx/5xx status codes, so test truthiness instead of identity.
            if not page:
                raise RuntimeError("can't get url's content")
            self._m3u8_content = page.text
        return self._m3u8_content
    # Return the playlist URL's directory part (everything up to xxx.m3u8).
    def _get_head_url(self):
        if self._head_url is None:
            find = re.findall(r'(.*/).*\.m3u8?', self._m3u8_url)
            if find:
                self._head_url = find[0]
            else:
                raise RuntimeError("can't get head url")
        return self._head_url
    # Whether the playlist declares encryption.
    # NOTE: assumes AES ECB/CBC; other METHOD values need separate handling.
    def _get_is_encrypted(self):
        if self._is_encrypted is None:
            # BUGFIX: re.match only matches at the start of the string, but the
            # #EXT-X-KEY line is almost never the first playlist line; search.
            if re.search(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content()):
                self._is_encrypted = True
            else:
                self._is_encrypted = False
        return self._is_encrypted
    # Return the #EXT-X-KEY line holding the encryption metadata.
    def _get_encrypted_line(self):
        if self._encrypted_line is None:
            find = re.findall(r'(#EXT-X-KEY:METHOD.*\n)', self._get_m3u8_content())
            if find:
                self._encrypted_line = find[0]
            else:
                raise RuntimeError("can't get encrypted line")
        return self._encrypted_line
    # Download and cache the AES key bytes.
    # NOTE: no padding to a 16-byte multiple is attempted here.
    def _get_encrypted_key(self):
        if self._encrypted_key is None:
            # key URI; NOTE: the pattern assumes the URI ends with 'y'
            # ('key' / '.key') -- adjust for other naming schemes.
            find = re.findall(r'URI="?(.*y)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            head_url = self._get_head_url()
            if find:
                key = find[0]
                # BUGFIX: the original passed the *list* `find` to re.match
                # (TypeError) and used head_url as a regex pattern; simply
                # test whether the URI is already absolute.
                if not key.startswith('http'):
                    key = head_url + key
            else:
                # key not present in the playlist: try the common default path
                key = head_url + 'key.key'
            req = requests.get(key)
            if req:
                self._encrypted_key = req.content
            else:
                raise RuntimeError("can't get encrypted key")
        return self._encrypted_key
    # Return the IV string from the key line, or None when absent.
    def _get_encrypted_iv(self):
        if self._encrypted_iv is None:
            find = re.findall(r'IV="?(\w*)"?.*\n', self._get_encrypted_line(), re.IGNORECASE)
            # BUGFIX: a missing IV previously raised, which made the ECB
            # branch of _get_decrypt_content unreachable; an absent IV now
            # simply selects ECB mode.
            self._encrypted_iv = find[0] if find else None
        return self._encrypted_iv
    # AES-decrypt one segment: ECB when iv is None, CBC otherwise.
    def _get_decrypt_content(self, content, key, iv=None):
        if iv is None:
            aes = AES.new(key, AES.MODE_ECB)
        else:
            # m3u8 IVs are usually hex text like '0x1234...', but AES.new
            # needs 16 raw bytes -- assumes hex-encoded IV, TODO confirm
            # against the playlists actually downloaded.
            if isinstance(iv, str):
                iv = bytes.fromhex(iv[2:] if iv.lower().startswith('0x') else iv)
            aes = AES.new(key, AES.MODE_CBC, iv)
        return aes.decrypt(content)
    # Decrypt `content` when the playlist declares encryption.
    def _decrypt_content(self, content):
        if self._get_is_encrypted():
            # BUGFIX: the original called self._encrypted_key() and
            # self._encrypted_iv(), i.e. tried to *call* the cached
            # attributes (TypeError); use the getter methods.
            key = self._get_encrypted_key()
            iv = self._get_encrypted_iv()
            content = self._get_decrypt_content(content, key, iv)
        return content
    # Placeholder for ciphey-based automatic decryption (never implemented).
    def _decrypt_content_by_cipher(self, content):
        pass
    # Return the (cached) list of absolute .ts segment URLs.
    def _get_urls(self):
        if self._urls is None:
            urls = re.findall(r'(h.*\.ts)', self._get_m3u8_content())
            if not urls:
                raise RuntimeError("can't find urls")
            # prepend head_url when the segments are relative paths
            # (r'^http' already covers https, so one test suffices)
            if not re.match(r'^http', urls[0]):
                head_url = self._get_head_url()
                urls = [head_url + u for u in urls]
            self._urls = urls
        return self._urls
    # Render a one-line textual progress bar for cur/end segments.
    def _process_bar(self, cur, end):
        print("\r", end='')
        print("download file ({}) {}%:".format((str(cur) + '/' + str(end)), int((cur / end) * 100)),
              '▋' * int((cur / end) * 100),
              end='')
        sys.stdout.flush()
    # Fetch one segment asynchronously and record it in the shared dict.
    async def _async_get_segment_content(self, segment_url, semaphore, index):
        async with semaphore:
            # BUGFIX: `content` was unbound when the response was falsy or an
            # exception fired before assignment, so `return content` raised
            # NameError; additionally no placeholder was appended on a falsy
            # response, stalling the merge/write processes forever.
            content = b''
            async with aiohttp.ClientSession() as session:
                try:
                    async with session.get(segment_url) as rep:
                        if rep:
                            content = await rep.read()
                            content = self._decrypt_content(content)
                except (Exception, RuntimeError):
                    # on error, fall through and record an empty segment
                    content = b''
                with self._lock:
                    # Manager.dict nested data cannot be mutated in place;
                    # read, modify, and reassign through a local variable
                    data = self._data['get']
                    data.append(
                        {
                            'start': index,
                            'end': index,
                            'content': content
                        }
                    )
                    self._data['get'] = data
                # update the progress bar
                self._process_bar(index + 1, self._data['max_count'])
                return content
    # Download every segment concurrently (child-process entry point).
    def _get_all_contents(self):
        loop = asyncio.new_event_loop()
        semaphore = asyncio.Semaphore(self._run_count)
        urls = self._get_urls()
        tasks = []
        for k, v in enumerate(urls):
            task = asyncio.ensure_future(self._async_get_segment_content(v, semaphore, k), loop=loop)
            tasks.append(task)
        loop.run_until_complete(asyncio.wait(tasks))
        loop.close()
    # Continuously merge adjacent downloaded segments in the shared list.
    def _merge_segment_content(self):
        while True:
            # stop once every segment has been written to disk
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 1:
                time.sleep(2)
                continue
            # take the lock before iterating so the list cannot change mid-scan
            with self._lock:
                for k, v in enumerate(self._data['get']):
                    for i, j in enumerate(self._data['get']):
                        # skip comparing an entry with itself
                        if k == i:
                            continue
                        # j starts right after v ends -> merge j into v
                        if v['end'] + 1 == j['start']:
                            get_data = self._data['get']
                            get_data[k]['content'] += get_data[i]['content']
                            get_data[k]['end'] = get_data[i]['end']
                            get_data.pop(i)
                            print('\n合并视频-' + str(v['start']) + '<<' + str(j['start']))
                            self._data['get'] = get_data
                            break
                    # for/else: inner loop found nothing to merge -> next k;
                    # otherwise break out of both loops and rescan
                    else:
                        continue
                    break
            time.sleep(1)
    # Append merged segments to the output file in playlist order.
    def _write_segment_content(self):
        while True:
            # stop once every segment has been written to disk
            if (self._data['write_count'] is not None) and (self._data['write_count'] + 1 >= self._data['max_count']):
                break
            if len(self._data['get']) <= 0:
                # avoid a hot busy-wait while nothing is ready
                time.sleep(0.05)
                continue
            content = None
            with self._lock:
                for k, v in enumerate(self._data['get']):
                    # the next chunk to write starts right after write_count
                    if (self._data['write_count'] is None) or (self._data['write_count'] + 1 == v['start']):
                        content = v['content']
                        self._data['write_count'] = v['end']
                        get_list = self._data['get']
                        get_list.pop(k)
                        print('\n写入视频-' + str(v['start']))
                        self._data['get'] = get_list
                        break
            # do the (slow) disk write outside the lock
            if content:
                with open(os.path.join(self._save_file_dir, self._save_file_name), 'ab') as f:
                    f.write(content)
            time.sleep(0.05)
    # Spawn the download / merge / write processes and wait for them.
    def run(self):
        self._lock = Lock()
        with Manager() as m:
            self._data = m.dict({
                # downloaded segment dicts: {'start', 'end', 'content'}
                'get': [],
                # total number of segments
                'max_count': len(self._get_urls()),
                # index of the last segment written to disk (None = none yet)
                'write_count': None,
            })
            tasks = (self._get_all_contents, self._merge_segment_content, self._write_segment_content)
            processes = [Process(target=v) for v in tasks]
            for p in processes:
                p.start()
            for p in processes:
                p.join()
# entry point: launch the interactive spider tool
if __name__ == '__main__':
    spider_tools()