# Changelog 2018-06-30 ----------
# 1. Added skip logic for files that already exist
# 2. Added stripping of illegal characters from file path names
# -*- coding: utf-8 -*-
# @Time : 2018/11/16 10:02 AM
# @Author : cxa
# @File : cosmic.py
# @Software: PyCharm
# !/usr/bin/env python
# encoding: utf-8
# !/usr/bin/env python
# encoding: utf-8
from requests_html import HTMLSession
import aiohttp
import asyncio
import hashlib
import os
from traceback import format_exc
import base64
from cryptography.fernet import Fernet
# 文件下载也要是异步
import aiofiles
import multiprocessing
from tomorrow import threads
from retrying import retry
# Worker-pool sizing heuristic for I/O-bound work (CPU count * 2 + 1).
workers = multiprocessing.cpu_count() * 2 + 1
# First comic index to crawl ("strat" is a typo for "start"; name kept — referenced below).
strat_num = 227002
# Last comic index to crawl (inclusive).
end_num = 250606
# Base64-wrapped Fernet key consumed by aes_cbc_decrypt below.
key = "X0JxSkg4NFVBQVBPODlUM0VzT1liNnloeWtLcndkSldRT2xURzQ4MEM5RT0="
# XPath: @file of the last <option> in the page-number <select> — yields the total page count.
page_num_xpath = "//p[@class='selectpage']/select[@id='page_number']/option[last()]/@file"
# XPath: src of the thumbnail image; spider() takes the second-to-last path segment as the comic id.
page_id_xpath = "//img[@id='thumbnail']/@src"
def aes_cbc_decrypt(message):
    """Decrypt *message* with the module-level Fernet key and return it as text.

    NOTE(review): despite the name, this is Fernet (AES-128-CBC + HMAC via
    the cryptography package), not raw AES-CBC.
    """
    fernet = Fernet(base64.b64decode(key).decode("utf8"))
    token = "{}".format(message).encode("utf8")
    return fernet.decrypt(token).decode("utf8")
# XPath: comic title (text of the page <title>).
cosmic_name = "//head//title/text()"
# XPath: src of the current page image.
cosmic_id = "//img[@id='curPic']/@src"
# Decrypted URL template for a comic's index page; formatted with the comic index number.
main_url = aes_cbc_decrypt(
    "gAAAAABbNdhqCnxkaJwZ2VL7HUXne_IOic-NsHtE30W-J68oecVmgm0dzO_lLXgTlI7a5_NbUWlkGm7FqLwY81XIBddNWbac4rCgBA9NFAECsNISkhTvdRl4uDSaS6bHY8sbcJJwO13Z")
# One index-page URL per comic in the configured [strat_num, end_num] range.
cosmic_urllist = [main_url.format(i) for i in range(strat_num, end_num + 1)]
pagenum_xpath = "//font[@id='TotalPage']/text()"
# Decrypted image URL template: formatted with comic id, page number and extension (jpg).
full_url = aes_cbc_decrypt(
    "gAAAAABbNdk5FLeX55hOiDAXxgCwwYmGrokYvU3Nd1AOYuOE7OdIEcBdAmSG_Q3kOltealBKMOgUBKDuPUJtzFFPwqoxL-FUip"
    "VNQU-JmBW_K5qxgzTQ3IOla_F61Rscy0fJOaN-mEXKPqrakctyDRN7OVm1LARTMhylQELLuBnJgIT4WXilchg=")
# Shared blocking session for index pages. Fix: the original created HTMLSession()
# twice (before and after the semaphore), discarding the first — one is enough.
session = HTMLSession()
# Caps concurrent image downloads at 5.
sema = asyncio.Semaphore(5)
async def getbuff(url, c_name):
    """Download one image and hand the bytes to getimg.

    If the .jpg variant comes back empty, the same URL is retried with a
    .png extension before saving.
    """
    async with aiohttp.ClientSession() as http:
        async with http.get(url, timeout=60) as resp:
            payload = await resp.read()
        if not payload:
            # Empty body: the server stores some pages as .png instead of .jpg.
            url = url.replace(".jpg", ".png")
            async with http.get(url, timeout=60) as retry_resp:
                payload = await retry_resp.read()
        await getimg(url, payload, c_name)
async def run(url, c_name):
    """Fetch one image URL while holding the global concurrency semaphore."""
    # Fix: `with (await sema)` is deprecated syntax that was removed in
    # Python 3.9 — `async with` is the supported way to acquire a Semaphore.
    async with sema:
        await getbuff(url, c_name)
@threads(30)
@retry(stop_max_attempt_number=5)
def asyc_get_req(url):
    """Fetch *url* with the shared blocking session.

    Runs on the `tomorrow` thread pool (30 workers) and is retried up to
    5 times by `retrying` when a non-200 status raises.
    """
    resp = session.get(url, timeout=15)
    if resp.status_code != 200:
        raise ValueError("访问出错")
    return resp
def spider(req):
    """Parse one comic index response and download all of its page images.

    Extracts the title, comic id and page count from the HTML, builds the
    per-page image URLs from full_url, and drives the async download tasks
    to completion on the event loop. Non-200 responses are ignored.
    """
    try:
        if req.status_code == 200:
            root = req.html
            name = root.xpath(cosmic_name)[0]
            print(name)
            # Debug dump of the raw page (appended, UTF-8).
            with open("1.html", 'a', encoding='utf-8') as fs:
                fs.write(req.text)
            # Fix: renamed local `id` — it shadowed the builtin id().
            comic_id = root.xpath(page_id_xpath)[0].split('/')[-2]
            max_page = root.xpath(page_num_xpath)[0].split('.')[0]
            full_urllist = [full_url.format(comic_id, i, "jpg") for i in range(1, int(max_page) + 1)]
            event_loop = asyncio.get_event_loop()
            tasks = [run(url, name) for url in full_urllist]
            event_loop.run_until_complete(asyncio.wait(tasks))
    except Exception:
        # Fix: was a bare `except:` — narrowed so Ctrl-C / SystemExit still
        # interrupt the crawl; any scrape failure is logged and skipped.
        print(format_exc())
async def getimg(url, buff, c_name):
    """Persist one downloaded image under comics_images/<title>/.

    When the title cannot be used as a directory name, its md5 hex digest is
    used as the folder name instead and the human-readable title is written
    to title.txt inside that folder. Files that already exist are skipped.
    """
    base = os.path.join(os.getcwd(), "comics_images")
    # Title-named directory (preferred).
    filepath = os.path.join(base, c_name)
    # Fallback: md5 of the title for names too long / illegal as a path.
    md5name = hashlib.md5(c_name.encode("utf-8")).hexdigest()
    filepath2 = os.path.join(base, md5name)
    img_name = url.split('/')[-1]
    image_id = os.path.join(filepath, img_name)
    # NOTE(review): in the fallback dir the file is named after the md5, not
    # the page — kept as-is to preserve the original skip/exists semantics.
    image_id2 = os.path.join(filepath2, md5name)
    if not os.path.exists(filepath) and not os.path.exists(filepath2):
        try:
            os.makedirs(filepath)
        except OSError:  # Fix: was bare except; only path errors belong here.
            os.makedirs(filepath2)
            image_id = image_id2
            # Record the real title next to the md5-named folder.
            # Fix: the aiofiles handle was never closed — use async with.
            async with aiofiles.open(os.path.join(filepath2, "title.txt"), 'w') as fs:
                await fs.write(c_name)
    # Skip files that were already downloaded.
    if not os.path.exists(image_id) and not os.path.exists(image_id2):
        # Fix: the aiofiles handle was never closed — use async with.
        async with aiofiles.open(image_id, 'wb') as f:
            await f.write(buff)
if __name__ == '__main__':
    # Fetch every comic index page first (asyc_get_req fans out on its own
    # thread pool), then parse and download each comic in turn.
    req_list = [asyc_get_req(url) for url in cosmic_urllist]
    for req in req_list:
        spider(req)