主要使用了异步爬虫的asyncio操作,如果觉得速度太慢可以添加多线程和多进程,下面就是全部代码。
import requests
from lxml import etree
import urllib.parse
import re
import aiofiles
import aiohttp
import asyncio
import os
# Create the download directory up front; exist_ok avoids the FileExistsError
# that os.mkdir raised whenever the script was re-run with 电影 already present.
os.makedirs("电影", exist_ok=True)
def get_page_source(url, timeout=30):
    """GET *url* and return the response body as text.

    timeout: seconds before the request is aborted.  The original had no
    timeout, so a dead connection could hang the whole script forever.
    """
    req = requests.get(url, timeout=timeout)
    return req.text
def get_href(url):
    """Return the per-episode link hrefs found in the page's playlist <ul>."""
    response = requests.get(url)
    dom = etree.HTML(response.text)
    return dom.xpath('//div[@class="playlist"]/ul/li/a/@href')
def get_iframe_src(new_href):
    """Extract the first-level m3u8 URL embedded in the player page's JS.

    Raises ValueError when the expected `var now="...";var pn="wjm3u8";`
    snippet is not present (the original dereferenced `.group()` on None,
    producing an opaque AttributeError when the page layout changed).
    """
    rep = requests.get(new_href)
    obj = re.compile('var now="(?P<m3u8>.*?)";var pn="wjm3u8";', re.S)
    match = obj.search(rep.text)
    if match is None:
        raise ValueError(f"m3u8 url not found in page: {new_href}")
    return match.group("m3u8")
def get_second_m3u8(m3u8):
    """Resolve and save the nested (second-level) m3u8 playlist.

    Fetches the first-level playlist at *m3u8*, takes its last whitespace
    token as the nested playlist path, downloads that playlist and writes it
    to <name>m3u8.text.  Returns the name token used as the file prefix.
    """
    response = requests.get(m3u8)
    nested_path = response.text.split()[-1]
    name = m3u8.split("/")[3]
    nested_url = urllib.parse.urljoin(m3u8, nested_path)
    playlist_text = get_page_source(nested_url)
    with open(f"{name}m3u8.text", mode="w", encoding="utf-8") as f:
        f.write(playlist_text)
    return name
async def down_load(name):
    """Download every .ts segment listed in <name>m3u8.text concurrently."""
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        lines = f.read().split()
    tasks = []
    for line in lines:
        if line.startswith("#"):  # skip m3u8 directives, keep only segment URLs
            continue
        file_name = line.split("/")[-1]
        tasks.append(asyncio.create_task(down_load_one(line, file_name)))
    # gather is the idiomatic form; the original asyncio.wait(tasks) raises
    # ValueError when the task list is empty, so guard that edge case.
    if tasks:
        await asyncio.gather(*tasks)
async def down_load_one(line, file_name, max_retries=None):
    """Download one .ts segment to ./电影/<file_name>, retrying on failure.

    max_retries: give up (re-raising the last error) after this many failed
    attempts; None — the default, matching the original behavior — retries
    forever.
    """
    attempts = 0
    while True:
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(line) as req:
                    content = await req.content.read()
                    async with aiofiles.open(f"./电影/{file_name}", mode="wb") as f1:
                        await f1.write(content)
            print("下载成功", line)
            break
        except Exception as e:
            print("下载失败", e)
            attempts += 1
            # The original looped forever on a permanently-broken URL; a
            # caller may now cap the retries without changing the default.
            if max_retries is not None and attempts >= max_retries:
                raise
def merge_ts(name):
    """Merge the downloaded .ts segments into 指环王.mp4 with Windows `copy /b`.

    Segments are concatenated in batches of ~100 because the Windows command
    line has a length limit; the batch files 1.ts..(n-1).ts are then merged
    into the final mp4.  Returns n so cut_down() can delete the batch files.
    NOTE: Windows-only — relies on the shell `copy /b` command.
    """
    print("正在进行合并")
    segments = []
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        for item in f.read().split():
            if item.startswith("#"):
                continue
            segments.append(item.split("/")[-1])
    cwd = os.getcwd()
    # BUG FIX: the original saved/restored the cwd but never actually changed
    # into 电影, so `copy` could not see the downloaded segments there —
    # cut_down() expects the batch files under ./电影/, confirming the intent.
    os.chdir("电影")
    try:
        batch = []
        n = 1
        for i, seg in enumerate(segments):
            batch.append(seg)
            if i != 0 and i % 100 == 0:
                os.system(f"copy /b {' + '.join(batch)} {n}.ts")
                n += 1
                batch = []  # 清空合并列表
        # BUG FIX: the original joined *all* segments for the final batch
        # instead of the leftover tail, duplicating every earlier batch in
        # the output file.  Also skip the copy when the tail is empty.
        if batch:
            os.system(f"copy /b {' + '.join(batch)} {n}.ts")
            n += 1
        parts = [f"{i}.ts" for i in range(1, n)]
        os.system(f"copy /b {' + '.join(parts)} 指环王.mp4")
    finally:
        # Restore the working directory even if a copy step blows up.
        os.chdir(cwd)
    print("合并完成")
    return n
def cut_down(name, n):
    """Delete the intermediate files left over from download/merge.

    Removes every segment named in <name>m3u8.text from ./电影/, the batch
    files 1.ts..n.ts, and finally the <name>m3u8.text index itself.
    Missing files are skipped silently, so the cleanup is idempotent.
    """
    print("正在删除多余文件")
    with open(f"{name}m3u8.text", mode="r", encoding="utf-8") as f:
        for line in f.read().split():
            if line.startswith("#"):
                continue
            seg_name = line.split("/")[-1]
            try:
                os.remove(f"./电影/{seg_name}")
            except FileNotFoundError:
                pass  # already gone — nothing to do
    for num in range(1, n + 1):
        # The last batch index may not exist (n is one past the final batch);
        # the original caught *every* exception here and printed "完成",
        # hiding real errors — only a missing file is expected and ignorable.
        try:
            os.remove(f"./电影/{num}.ts")
        except FileNotFoundError:
            pass
    os.remove(f"{name}m3u8.text")
    print("文件删除完成")
def main():
    """Crawl each episode link, download its segments, merge, and clean up."""
    url = "http://www.yaboeye.com/mz/64626.html"
    hrefs = get_href(url)
    for href in hrefs:
        new_href = urllib.parse.urljoin(url, href)
        m3u8 = get_iframe_src(new_href)
        name = get_second_m3u8(m3u8)
        # asyncio.run replaces the deprecated get_event_loop() /
        # run_until_complete() pair and creates/closes a fresh loop per
        # episode instead of reusing a possibly-stale one.
        asyncio.run(down_load(name))
        n = merge_ts(name)
        cut_down(name, n)


if __name__ == '__main__':
    main()
下载好后不会出现多余的文件,过程可能有些漫长,请耐心等待。