"""下载大型资源支持断点续传、下载进度条,强大的 async,异步批量下载资源,榨干带宽,自身cpu内存消耗极小,恐怖如斯。主要使用 aiohttp、aiofiles。"""
import asyncio
import csv
import io
import json
import os as sync_os
import random
import sys
import unicodedata
from functools import wraps
from urllib.parse import urljoin

import aiofiles
import aiohttp
import misaka
from aiofiles import os
from bs4 import BeautifulSoup
from rich import print
from tqdm import tqdm
def with_retries(max_tries: int = 5, retries_sleep_second: float = 1):
    """Decorator factory: retry an async function up to ``max_tries`` times.

    Sleeps ``retries_sleep_second`` seconds between attempts and raises
    ``TimeoutError`` once every attempt has failed.

    BUGFIX: the original slept once more after the final failed attempt,
    delaying the TimeoutError by a full sleep interval for nothing.
    """
    def wrapper(async_function):
        @wraps(async_function)
        async def wrapped(*args, **kwargs):
            for attempt in range(1, max_tries + 1):
                try:
                    return await async_function(*args, **kwargs)
                except Exception as e:
                    # Broad catch is intentional: network failures surface
                    # as several aiohttp/OS exception types.
                    print(f"Aiohttp ClientError: {str(e)}, tries: {attempt}")
                    if attempt < max_tries:
                        await asyncio.sleep(retries_sleep_second)
            raise TimeoutError("Reached aiohttp max tries")
        return wrapped
    return wrapper
class SN(asyncio.Future):
    """Async crawler/downloader for an OJ practice platform.

    ``await SN()`` performs the login (via ``__await__``); ``get_session``
    then builds an authorized aiohttp session.  The class collects question
    metadata page by page and concurrently saves question JSON, write-up
    markdown, embedded pictures, and attachments (attachments with HTTP
    Range resume support and a tqdm progress bar).

    NOTE(review): inheriting from asyncio.Future appears to serve only to
    make instances awaitable via ``__await__`` — confirm before simplifying.
    """

    BASE_URL = "http://ip"
    URL = f"{BASE_URL}/api/mp/auth/login"
    USERNAME = "**"
    PASSWORD = "**"
    # Class-level mutable state, shared across instances; acceptable here
    # because the script only ever creates a single SN instance.
    token = ""
    cookie_value = ""
    participant_list_data = []   # currently unused
    resource_url = []            # per-question bundle of downloadables
    questions = []               # rows destined for question.csv

    def get_session(self):
        """Build the authorized session and load the resume-record file.

        Must be called after ``login`` has populated ``self.token``.
        """
        headers: dict[str, str] = {"Authorization": self.token}
        # Generous timeout: large downloads can legitimately run for hours.
        timeout = aiohttp.ClientTimeout(total=3600 * 48)
        self.client = aiohttp.ClientSession(
            headers=headers,
            timeout=timeout
        )
        # BUGFIX: the old ``.replace("py", "txt")`` replaced every "py"
        # anywhere in the script name (e.g. "copy.py" -> "cotxt.txt");
        # swap only the extension instead.
        script_name = sync_os.path.basename(sys.argv[0])
        self.RECORD_TXT = sync_os.path.splitext(script_name)[0] + ".txt"
        if not sync_os.path.exists(self.RECORD_TXT):
            # BUGFIX: portable "touch" — os.mknod requires elevated
            # privileges on some platforms (e.g. macOS).
            open(self.RECORD_TXT, "a", encoding="utf-8").close()
        with open(self.RECORD_TXT, encoding='utf-8') as f:
            # The record file is a "|"-separated list of finished items.
            self.record = set(f.read().split("|"))

    async def close_session(self):
        """Close the shared aiohttp session."""
        await self.client.close()

    async def login(self):
        """POST the credentials, store the JWT token, and return ``self``."""
        data = {
            "username": self.USERNAME,
            "password": self.PASSWORD,
        }
        # Throwaway session: ``self.client`` does not exist yet at login time.
        async with aiohttp.ClientSession() as session:
            async with session.post(self.URL, json=data) as resp:
                if resp.status != 200:
                    print('no')
                res = await resp.json()
                self.token = f'JWT {res.get("data", {}).get("token")}'
        return self

    def __await__(self):
        # ``await SN()`` logs in and yields the ready instance.
        return self.login().__await__()

    async def get(self, url):
        """GET ``url`` with the authorized session and return the JSON body.

        Raises when the response carries no ``data`` payload.
        """
        async with self.client.get(url=url) as resp:
            if resp.status != 200:
                print('no')
            res = await resp.json()
        if not res.get("data"):
            print(url, res)
            # BUGFIX: raise Exception instead of BaseException so retry
            # wrappers (and normal ``except Exception`` handlers) can catch
            # it; any existing ``except BaseException`` caller still works.
            raise Exception(str(res))
        return res

    async def big_list(self, page, size):
        """Fetch one page of the outer practice list.

        Returns a list of ``{"url", "name"}`` dicts, one per practice item.
        """
        url = f"{self.BASE_URL}/api/ojj/oj/admin/practice/operation/list/?per_page={size}&page={page}"
        res = await self.get(url)
        big_list = []
        for one in res.get("data", {}).get("list", []):
            name: str = one.get("title")
            print(name)
            resource_id = one.get("resource_id")
            url = f'{self.BASE_URL}/api/ojj/oj/admin/practice/operation/?resource_id={resource_id}'
            big_list.append({"url": url, "name": f'{name}-{one.get("practice_set_name")}'})
        return big_list

    async def participant_list(self, url, name):
        """Fetch one question's detail and stash its downloadable parts.

        Appends one CSV row to ``self.questions`` and one download bundle
        (question JSON, attachment, write-up markdown, pictures) to
        ``self.resource_url``.
        """
        res = await self.get(url)
        data = res.get("data", {})
        pic_url_list = []
        file_data = {}
        md = {}
        old_name = data["title"]
        # Overwrite the title with the caller-supplied "{index}-{name}" so
        # every output directory gets a unique, ordered name.
        data["title"] = name
        difficulty = {
            0: "简单",
            1: "一般",
            2: "较难",
            3: "困难",
        }
        # NOTE(review): json.loads(data.get("answer"))[0] assumes "answer"
        # is always a non-empty JSON array — confirm against the API.
        self.questions.append([
            data.get("title", "").split("-")[0],
            old_name,
            json.loads(data.get("answer"))[0],
            difficulty.get(data.get("difficulty")),
            "|".join(json.loads(data.get("knowledge"))),
            str(bool(data.get("scene_config_data")))])
        question = {
            "title": data.get("title"),
            "answer": data.get("answer"),
            "difficulty": data.get("difficulty"),
            "origin_data": data,
        }
        if data.get("files"):
            # NOTE(review): only the LAST attachment survives this loop —
            # looks like a bug if a question can ship several files; kept
            # as-is because downstream code expects a single dict.
            for file in data.get("files"):
                file_data = {
                    "file_dir": data.get("title").replace("/", "_"),
                    "file_name": file.get("file_name"),
                    "file_url": urljoin(self.BASE_URL, file.get("file_url"))
                }
        if data.get("writeup"):
            # Render the markdown write-up to HTML purely to discover the
            # <img> tags it references.
            md_render = misaka.Markdown(misaka.HtmlRenderer())
            html = md_render(data.get("writeup"))
            soup = BeautifulSoup(html, features="html.parser")
            md = {
                "name": data.get("title"),
                # Rewrite server attachment paths to the local "pic"
                # directory the pictures are downloaded into.
                "content": data.get("writeup").replace("/cpms/oj/practice_attachment/", "pic")
            }
            for raw in soup.find_all("img"):
                pic_url: str = raw.get("src", "")
                if not pic_url:
                    continue
                pic_name = pic_url.split("/")[-1]
                all_pic_url = urljoin(self.BASE_URL, pic_url)
                pic_url_list.append({
                    "pic_url": all_pic_url,
                    "file_name": pic_name,
                    "list_name": data.get("title"),
                })
        self.resource_url.append({
            "question": question,
            "file": file_data,
            "pic": pic_url_list,
            "md": md,
            "scene": bool(data.get("scene_config_data"))
        })

    async def save_json(self, middle_dir, file_name, content):
        """Persist a question's raw JSON under ``resources_qs/<middle_dir>/``.

        NOTE(review): this checks the record but never appends to it, so the
        skip branch only fires for keys written by ``download`` — confirm
        whether that is intended.
        """
        if f'{middle_dir}/{file_name}' not in self.record:
            target_dir = f"resources_qs/{middle_dir}".replace(" ", "_")
            # exist_ok replaces the old blanket try/except around makedirs.
            await os.makedirs(target_dir, exist_ok=True)
            file_path = f"{target_dir}/{file_name}"
            if await os.path.exists(file_path):
                try:
                    await os.remove(file_path)
                except OSError:
                    # Best effort: a stale file we cannot delete is
                    # truncated by the "w" open below anyway.
                    pass
            async with aiofiles.open(file_path, mode="w") as f:
                await f.write(str(content))
        else:
            print(f'{file_name}已经保存')

    @with_retries()
    async def download(self, url, middle_dir, file_name):
        """Download a small same-origin resource (picture) and record it.

        Items already present in the record file are skipped.
        """
        if url and url.startswith(f'{self.BASE_URL}/'):
            if f'{middle_dir}/{file_name}' not in self.record:
                target_dir = f"resources_qs/{middle_dir}".replace(" ", "_")
                await os.makedirs(target_dir, exist_ok=True)
                file_path = f"{target_dir}/{file_name}"
                if await os.path.exists(file_path):
                    try:
                        await os.remove(file_path)
                    except OSError:
                        pass
                async with self.client.get(url=url) as response:
                    async with aiofiles.open(file_path, mode="wb",) as f:
                        # Stream in 1 KiB chunks to keep memory flat.
                        async for chunk in response.content.iter_chunked(1024):
                            await f.write(chunk)
                print(f'end {file_name}')
                # Mark the item as finished so a rerun skips it.
                async with aiofiles.open(self.RECORD_TXT, mode="a") as f_record:
                    await f_record.write(f'{middle_dir}/{file_name}|')
            else:
                print(f"{file_name} 已经下载跳过")

    @with_retries()
    async def zip_download(self, url, middle_dir, file_name):
        """Download a (large) attachment with resume support + progress bar.

        An existing partial file's size seeds an HTTP Range request so the
        transfer continues where it left off.
        """
        if url and url.startswith("http"):
            target_dir = f"resources_sn/{middle_dir}".replace(" ", "_")
            await os.makedirs(target_dir, exist_ok=True)
            file_path = f"{target_dir}/{file_name}"
            if await os.path.exists(file_path):
                temp_size = await os.path.getsize(file_path)
            else:
                temp_size = 0
            # First request only to learn the full size.
            async with self.client.get(url=url) as resp:
                total_size = int(resp.headers.get("content-length", 0))
            if temp_size >= total_size:
                print(f"{file_name} 下载完成")
                return
            # BUGFIX: the Range end position is inclusive, so the old
            # "bytes=X-{total_size}" asked for one byte past EOF; the
            # open-ended form is the correct "from X to end" (RFC 7233).
            headers = {"Range": f'bytes={temp_size}-'}
            async with self.client.get(url=url, headers=headers) as response:
                async with aiofiles.open(file_path, mode="ab+",) as f:
                    with tqdm(initial=temp_size,
                              desc=f"{self.cjkrjust(file_name, 40)}",
                              total=total_size,
                              unit='',
                              unit_scale=True,
                              unit_divisor=1024,
                              colour=self.random_color()) as bar:
                        async for chunk in response.content.iter_chunked(1024):
                            await f.write(chunk)
                            bar.update(len(chunk))

    def random_color(self):
        """Return a random ``#RRGGBB`` color (digits 1-9, A-F) for tqdm."""
        return "#" + "".join(random.choice("123456789ABCDEF") for _ in range(6))

    def count_cjk_chars(self, string):
        """Count fullwidth/wide (CJK) characters in ``string``."""
        return sum(unicodedata.east_asian_width(c) in 'FW' for c in string)

    def cjkrjust(self, string, width, fillbyte=' '):
        """Right-justify, accounting for double-width CJK characters.

        Each CJK char occupies two display columns, so the pad width is
        reduced by one per CJK char.
        """
        return string.rjust(width - self.count_cjk_chars(string), fillbyte)

    async def save_md(self, middle_dir, file_name, content):
        """Save the write-up markdown and record completion.

        NOTE(review): the record key here is the bare file name, while
        ``download`` keys on "dir/name" — inconsistent, but kept so existing
        record files remain valid.
        """
        if file_name not in self.record:
            target_dir = f"resources_qs/{middle_dir}".replace(" ", "_")
            await os.makedirs(target_dir, exist_ok=True)
            file_path = f"{target_dir}/{file_name}"
            if await os.path.exists(file_path):
                try:
                    await os.remove(file_path)
                except OSError:
                    pass
            async with aiofiles.open(file_path, mode="w") as f:
                await f.write(content)
            async with aiofiles.open(self.RECORD_TXT, mode="a") as f_record:
                await f_record.write(f'{file_name}|')
        else:
            print(f'{file_name} 已经保存跳过')

    async def save_csv(self, data):
        """Write the collected question rows to question.csv.

        BUGFIX: csv.writer cannot write through an aiofiles handle — its
        ``write`` returns a coroutine that csv never awaits, so the file
        ended up empty.  Render the CSV into an in-memory buffer first,
        then write the finished text asynchronously.
        """
        buffer = io.StringIO()
        csv.writer(buffer).writerows(data)
        async with aiofiles.open("question.csv", 'w', encoding='utf-8', newline='') as f:
            await f.write(buffer.getvalue())
async def main():
    """Log in, collect all questions, then download everything concurrently."""
    sn = await SN()
    sn.get_session()
    print(sn.token)
    big_list = await sn.big_list(1, 200)
    # Fetch every question detail concurrently; each call appends to
    # sn.resource_url / sn.questions.
    await asyncio.gather(*[sn.participant_list(
        data.get("url"), f'{index}-{data.get("name")}'
    ) for index, data in enumerate(big_list)])
    await sn.save_csv(sn.questions)
    tasks = []
    for one in sn.resource_url:
        # Bucket the output by whether the question ships a scene config.
        scene_dir = "scene_yes" if one.get("scene") else "scene_no"
        question = one.get("question")
        tasks.append(sn.save_json(
            f'{scene_dir}/{question.get("title")}',
            f'{question.get("title")}.json',
            question.get("origin_data")))
        file_data = one.get("file")
        if file_data:
            tasks.append(sn.zip_download(
                file_data.get("file_url"),
                f'{scene_dir}/{file_data.get("file_dir")}',
                file_data.get("file_name")
            ))
        md = one.get("md")
        if md:
            tasks.append(sn.save_md(
                f'{scene_dir}/{md.get("name")}',
                f'{md.get("name")}.md',
                md.get("content")
            ))
        for pic in one.get("pic", []):
            tasks.append(sn.download(
                pic.get("pic_url"),
                f'{scene_dir}/{pic.get("list_name")}/pic',
                pic.get("file_name")
            ))
    await asyncio.gather(*tasks)
    await sn.close_session()
    print("----------------------end------------------------")


if __name__ == "__main__":
    # BUGFIX: entry-point guard so importing this module no longer
    # triggers a full crawl as a side effect.
    asyncio.run(main())