看漫画学python下载_漫画批量下载

#更新记录20180630----------

1. 添加已存在文件的跳过逻辑

2. 添加去除文件路径名中非法字符的逻辑

# -*- coding: utf-8 -*-

# @Time : 2018/11/16 10:02 AM

# @Author : cxa

# @File : cosmic.py

# @Software: PyCharm

# !/usr/bin/env python

# encoding: utf-8

# !/usr/bin/env python

# encoding: utf-8

from requests_html import HTMLSession

import aiohttp

import asyncio

import hashlib

import os

from traceback import format_exc

import base64

from cryptography.fernet import Fernet

# 文件下载也要是异步

import aiofiles

import multiprocessing

from tomorrow import threads

from retrying import retry

# Pool size heuristic: twice the CPU count plus one.
# NOTE(review): not used by any live code visible in this file.
workers = multiprocessing.cpu_count() * 2 + 1

# First comic index to crawl (inclusive).
strat_num = 227002

# Last comic index to crawl (inclusive: the URL list is built with
# range(strat_num, end_num + 1)).
end_num = 250606

# Base64-encoded Fernet key; aes_cbc_decrypt() base64-decodes it before use.
# NOTE(review): the key is hard-coded in the source, so the encrypted URL
# templates below are obfuscated rather than secret.
key = "X0JxSkg4NFVBQVBPODlUM0VzT1liNnloeWtLcndkSldRT2xURzQ4MEM5RT0="

# XPath selecting the `file` attribute of the last <option> in the
# page-number <select>; spider() takes the part before the first '.' as the
# highest page number.
page_num_xpath = "//p[@class='selectpage']/select[@id='page_number']/option[last()]/@file"

# XPath selecting the `src` attribute of the thumbnail <img>; spider() takes
# the second-to-last '/'-separated segment of it as the comic id.
page_id_xpath = "//img[@id='thumbnail']/@src"

def aes_cbc_decrypt(message):
    """Decrypt a Fernet token and return the plaintext as a string.

    The module-level ``key`` is base64-decoded and used as the Fernet key.

    :param message: the token to decrypt; it is formatted with ``"{}"`` and
        UTF-8 encoded before being handed to Fernet.
    :return: the decrypted payload decoded as UTF-8.
    """
    # Build the cipher first, then the token, matching the original
    # evaluation order.
    cipher = Fernet(base64.b64decode(key).decode("utf8"))
    token = bytes("{}".format(message), encoding="utf8")
    plaintext = cipher.decrypt(token)
    return plaintext.decode("utf8")

# XPath for the comic title (text of the page <title>).
cosmic_name = "//head//title/text()"

# XPath for the comic id (`src` of the current picture).
# NOTE(review): not referenced by the code visible in this file.
cosmic_id = "//img[@id='curPic']/@src"

# Decrypted URL template of a comic index page; takes one positional
# placeholder, the comic index.
main_url = aes_cbc_decrypt(
    "gAAAAABbNdhqCnxkaJwZ2VL7HUXne_IOic-NsHtE30W-J68oecVmgm0dzO_lLXgTlI7a5_NbUWlkGm7FqLwY81XIBddNWbac4rCgBA9NFAECsNISkhTvdRl4uDSaS6bHY8sbcJJwO13Z")

# One index-page URL per comic index in [strat_num, end_num].
cosmic_urllist = [main_url.format(i) for i in range(strat_num, end_num + 1)]

# XPath for the total page count.
# NOTE(review): not referenced by the code visible in this file.
pagenum_xpath = "//font[@id='TotalPage']/text()"

# Decrypted URL template of a single image; spider() fills it with
# (comic id, page number, file extension such as "jpg").
full_url = aes_cbc_decrypt(
    "gAAAAABbNdk5FLeX55hOiDAXxgCwwYmGrokYvU3Nd1AOYuOE7OdIEcBdAmSG_Q3kOltealBKMOgUBKDuPUJtzFFPwqoxL-FUip"
    "VNQU-JmBW_K5qxgzTQ3IOla_F61Rscy0fJOaN-mEXKPqrakctyDRN7OVm1LARTMhylQELLuBnJgIT4WXilchg=")

# Shared HTTP session used by asyc_get_req(). It is created exactly once:
# the original created it twice and dropped the first instance unclosed.
session = HTMLSession()

# Semaphore acquired by run() around each image download, so at most five
# downloads are in flight at a time.
sema = asyncio.Semaphore(5)

async def getbuff(url, c_name):
    """Download one image and pass its bytes on to ``getimg``.

    If the body returned for ``url`` is empty, ".jpg" in the URL is replaced
    with ".png" and the download is attempted once more. ``getimg`` is then
    called with the URL that was last requested.

    :param url: image URL to fetch.
    :param c_name: comic title, forwarded unchanged to ``getimg``.
    """
    async with aiohttp.ClientSession() as http:
        async with http.get(url, timeout=60) as jpg_resp:
            payload = await jpg_resp.read()
            if len(payload) == 0:
                # Nothing came back for the .jpg name; try .png instead.
                url = url.replace(".jpg", ".png")
                async with http.get(url, timeout=60) as png_resp:
                    payload = await png_resp.read()
            await getimg(url, payload, c_name)

async def run(url, c_name):
    """Download one image while holding the module-level semaphore ``sema``.

    :param url: image URL, forwarded to ``getbuff``.
    :param c_name: comic title, forwarded to ``getbuff``.
    """
    # `with (await sema)` was deprecated in Python 3.7 and removed in 3.9,
    # where awaiting a Semaphore raises TypeError. `async with` is the
    # supported way to acquire and release it.
    async with sema:
        await getbuff(url, c_name)

#

@threads(30)
@retry(stop_max_attempt_number=5)
def asyc_get_req(url):
    """Fetch ``url`` with the shared ``session`` and return the response.

    The call is wrapped by ``retry(stop_max_attempt_number=5)`` and
    ``threads(30)``.
    NOTE(review): presumably this retries up to five times on any exception
    and runs the call on a pool of 30 threads -- confirm against the
    ``retrying`` and ``tomorrow`` documentation.

    :param url: page URL to request (15 second timeout).
    :return: the response object when the status code is 200.
    :raises ValueError: when the status code is anything other than 200.
    """
    response = session.get(url, timeout=15)
    if response.status_code != 200:
        raise ValueError("访问出错")
    return response

def spider(req):
    """Parse one comic index page and download all of its page images.

    Steps: read the title, comic id and highest page number from the page,
    build one image URL per page from ``full_url``, then run one ``run()``
    coroutine per image on the event loop until all have finished.

    Any error is printed and not propagated, so one bad comic does not stop
    the caller's loop. ``KeyboardInterrupt`` and ``SystemExit`` are no
    longer swallowed.

    :param req: response object for a comic index page. Nothing is done
        unless its ``status_code`` is 200.
    """
    try:
        if req.status_code != 200:
            return

        root = req.html
        name = root.xpath(cosmic_name)[0]
        print(name)

        # Append the raw page to a local file (kept from the original).
        with open("1.html", 'a', encoding='utf-8') as fs:
            fs.write(req.text)

        # The comic id is the second-to-last path segment of the thumbnail URL.
        gallery_id = root.xpath(page_id_xpath)[0].split('/')[-2]
        # The last <option>'s `file` attribute looks like "<n>.<ext>".
        max_page = root.xpath(page_num_xpath)[0].split('.')[0]
        full_urllist = [full_url.format(gallery_id, i, "jpg")
                        for i in range(1, int(max_page) + 1)]

        async def download_all():
            # gather() accepts bare coroutines and an empty argument list.
            # asyncio.wait() rejects coroutines since Python 3.11 and raises
            # ValueError when given nothing to wait for.
            # return_exceptions=True keeps one failed image from aborting
            # the others.
            return await asyncio.gather(
                *(run(url, name) for url in full_urllist),
                return_exceptions=True
            )

        event_loop = asyncio.get_event_loop()
        outcomes = event_loop.run_until_complete(download_all())

        # Report per-image failures instead of losing them silently.
        for url, outcome in zip(full_urllist, outcomes):
            if isinstance(outcome, BaseException):
                print("{} failed: {!r}".format(url, outcome))
    except Exception:
        print(format_exc())

async def getimg(url, buff, c_name):
    """Write one downloaded image to disk unless it is already there.

    Images are stored under ``<cwd>/comics_images/<c_name>/``. If that
    directory cannot be created (for example because the title is too long
    or contains characters the file system rejects), the MD5 hex digest of
    the title is used as the directory name and the real title is written
    to ``title.txt`` inside it.

    Fixes over the original:
    * The fallback image path is built from the image file name, not the
      MD5 name, so images in the fallback directory no longer collide.
    * The target directory is chosen from whichever one exists, so images
      after the first also reach the fallback directory.
    * Files are opened with ``async with`` so they are always closed.
    * Only ``OSError`` triggers the fallback.
    * ``title.txt`` is written as UTF-8.
    * An empty payload is not written, so a 0-byte file cannot block a
      later re-download.

    :param url: image URL; its last path segment is the file name.
    :param buff: image bytes.
    :param c_name: comic title, used as the directory name.
    """
    if not buff:
        print("empty response, skipped: {}".format(url))
        return

    base_dir = os.path.join(os.getcwd(), "comics_images")
    # Directory named after the title.
    title_dir = os.path.join(base_dir, c_name)
    # Fallback directory named after the MD5 of the title.
    md5name = hashlib.md5(c_name.encode("utf-8")).hexdigest()
    hashed_dir = os.path.join(base_dir, md5name)
    file_name = url.split('/')[-1]

    # Pick the directory this comic already uses, or create one.
    if os.path.isdir(title_dir):
        target_dir = title_dir
    elif os.path.isdir(hashed_dir):
        target_dir = hashed_dir
    else:
        try:
            os.makedirs(title_dir)
            target_dir = title_dir
        except OSError:
            os.makedirs(hashed_dir, exist_ok=True)
            target_dir = hashed_dir
            # Record the real title next to the images.
            title_path = os.path.join(hashed_dir, "title.txt")
            async with aiofiles.open(title_path, 'w', encoding="utf-8") as title_file:
                await title_file.write(c_name)

    # Skip files that were already downloaded.
    image_path = os.path.join(target_dir, file_name)
    if not os.path.exists(image_path):
        async with aiofiles.open(image_path, 'wb') as image_file:
            await image_file.write(buff)

if __name__ == '__main__':
    # Submit every index-page request up front, then process the responses
    # one by one in submission order.
    req_list = [asyc_get_req(url) for url in cosmic_urllist]
    for req in req_list:
        spider(req)

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值