声明:
1.本教程及其相关派生物仅用于个人学习python、研究python或欣赏python,以及其他非商业性或非盈利性用途。故,因使用本教程及其相关派生物而产生的任何风险甚至法律责任均由使用者自己承担。
2.本教程不提供相关网站连接方法。
3.初学Python,不足之处还望各位大佬指出。
运行环境:Python 3;相关第三方库请自行安装。
配置文件:
第一行:Cookie
第二行:下载路径
第三行:最小下载延时时间[秒]
第四行:最大下载延时时间[秒]
原理:建立链接列表,设置超时,正则表达式逐步匹配
代码:
from requests.adapters import HTTPAdapter
from fake_useragent import UserAgent
from requests_toolbelt import SSLAdapter
import os,time,random,re
import requests,ssl,chardet
# Version: 1.0
# This script downloads comics; galleries with an extremely large number of
# pages may not download completely.
# Supports resuming from where a previous run was interrupted. For multi-page
# galleries only the first-page URL needs to be entered.
ua = UserAgent()  # rotating User-Agent source for every request
Cookies = None  # config line 1: raw Cookie header string (site login/session)
TargetPath = None  # config line 2: download directory root
MinTime = None  # config line 3: minimum delay between downloads, seconds
MaxTime = None  # config line 4: maximum delay between downloads, seconds
urlHead='https://exhentai.org/g/'
def ReadConfig():  # load settings from the user-supplied config file
    """Prompt for a config file path and populate the four global settings.

    Config file layout, one value per line:
        line 1: Cookie header string
        line 2: download directory
        line 3: minimum download delay [seconds]
        line 4: maximum download delay [seconds]

    Re-prompts until a readable, well-formed file is supplied.
    (The original recursed on every bad input, which could eventually hit the
    recursion limit; a loop is equivalent and unbounded-safe.)
    """
    global Cookies, TargetPath, MinTime, MaxTime
    while True:
        path = input('配置文件绝对路径 [也可以拖进来]:')
        if not os.path.exists(path):
            continue
        try:
            with open(path, "r") as fileHandler:
                listOfLines = fileHandler.read().splitlines()
            Cookies = listOfLines[0]
            TargetPath = listOfLines[1]
            MinTime = int(listOfLines[2])
            MaxTime = int(listOfLines[3])
            return
        except (OSError, IndexError, ValueError):
            # unreadable file, fewer than 4 lines, or non-numeric delay:
            # ask again instead of crashing (original used a bare except)
            continue
def DlComic(Url, dlPath):  # resolve a viewer-page URL and save its image
    """Fetch one viewer page, extract the full-size image URL and save it.

    Url: a viewer-page link (https://exhentai.org/s/...).
    dlPath: destination file path for the downloaded image.
    Prints 'succeed' on success; any network/parse error propagates to the
    caller (getImgList handles retries).
    """
    global ua, Cookies, MinTime, MaxTime
    DlPage = requests.Session()
    DlPage.mount('https://', HTTPAdapter(max_retries=5))
    # NOTE: this second mount replaces the HTTPAdapter for 'https://';
    # it forces TLS 1.2 but drops the retry setting (kept as original).
    DlPage.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2))
    response = DlPage.get(Url, headers={'Cookie': Cookies, 'User-Agent': ua.random}, timeout=5).content
    encode_type = chardet.detect(response)
    pageStr = response.decode(encode_type['encoding'])
    # NOTE(review): the original regex was garbled during extraction from the
    # blog post; the viewer page exposes the full-size image as
    # <img id="img" src="..."> -- confirm against the live page HTML.
    imgTUrl = str(re.findall(r'(?<=<img id="img" src=").*?(?=")', pageStr)[0])
    time.sleep(random.uniform(MinTime, MaxTime))  # polite random delay between requests
    DlImg = requests.Session()
    DlImg.mount('https://', HTTPAdapter(max_retries=5))
    DlImg.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2))
    response = DlImg.get(imgTUrl, headers={'User-Agent': ua.random}, timeout=120)
    with open(dlPath, 'wb') as f:
        f.write(response.content)
        f.flush()
    print('succeed')
def getImgList(Urls, title):  # collect viewer-page links, then download each image
    """Scrape every gallery page in Urls for viewer-page links and download
    the corresponding images into TargetPath/title/.

    Urls: list of gallery-page URLs (first page plus ?p=N pages).
    title: sanitized gallery title, used as the subdirectory name.
    Images already present on disk are skipped ('done'), enabling resume.
    Each failed download is retried up to 10 times.
    """
    global TargetPath, ua, Cookies
    imgList = []
    for Url in Urls:
        DlPage = requests.Session()
        DlPage.mount('https://', HTTPAdapter(max_retries=5))
        DlPage.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2))
        page = DlPage.get(Url, headers={'Cookie': Cookies, 'User-Agent': ua.random}, timeout=5).content
        encode_type = chardet.detect(page)
        pageStr = page.decode(encode_type['encoding'])
        imgList.extend(re.findall(r'https\:\/\/exhentai\.org\/s\/\w{1,}\/[\w\-]{1,}', pageStr))
    Numl = len(imgList)
    print('共' + str(Numl) + '页')
    # os.path.join instead of the original hard-coded '\\' so the script
    # also works on non-Windows systems
    dlDir = os.path.join(TargetPath, title)
    if not os.path.exists(dlDir):
        os.mkdir(dlDir)
    for item, imgUrl in enumerate(imgList):
        target = os.path.join(dlDir, str(item) + '.jpg')
        print(' ' + str(item + 1) + "/" + str(Numl) + " >>> " + imgUrl, ' >>> ', end='')
        try:
            if not os.path.exists(target):
                DlComic(imgUrl, target)
            else:
                print('done')  # already downloaded in a previous run
        except Exception as e:
            print('Error' + str(e))
            # retry this image up to 10 times, stopping at the first success
            # (original tracked this with an IsError flag and never broke out)
            for i in range(10):
                print(' ' + str(item + 1) + " ReDl >>> " + str(i) + ' >>> ', end='')
                try:
                    DlComic(imgUrl, target)
                    break
                except Exception:
                    continue
def getPageList(Url):  # parse the gallery front page and enumerate all sub-pages
    """Parse a gallery front-page URL, collect every paginated gallery-page
    link, derive a filesystem-safe title, and hand both to getImgList.

    Rejects anything that is not an exhentai.org/g/ link.
    """
    global ua, Cookies
    if Url.find('https://exhentai.org/g/') == -1:
        print('>>> 错误的里站链接 ' + Url)
        return
    DlPage = requests.Session()
    DlPage.mount('https://', HTTPAdapter(max_retries=5))
    page = DlPage.get(Url, headers={'Cookie': Cookies, 'User-Agent': ua.random}).content
    encode_type = chardet.detect(page)
    page = page.decode(encode_type['encoding'])
    # BUG FIX: the original pattern contained a Greek omicron in 'οnclick'
    # (a blog-platform mangling), so it never matched and only the first
    # gallery page was ever downloaded.
    subPageUrls = re.findall(
        r'https\:\/\/exhentai\.org\/g\/\w{1,}\/[\w\-]{1,}\/\?p=\d{1,}\" onclick=\"return false\">\d{1,}',
        page)
    subPageUrls = re.findall(
        r'https\:\/\/exhentai\.org\/g\/\w{1,}\/[\w\-]{1,}\/\?p=\d{1,}',
        "','".join(subPageUrls))
    # The pager is rendered twice (top and bottom of the page); keep one copy.
    subPageUrls = subPageUrls[0:int(len(subPageUrls) / 2)]
    subPageUrls.insert(0, Url)
    # NOTE(review): the title regex was garbled during extraction from the
    # blog post; the gallery title lives in <h1 id="gn">...</h1> -- confirm
    # against the live page HTML.
    titleStr = str(re.findall(r'(?<=<h1 id="gn">).*?(?=</h1>)', page)[0])
    # Strip characters that are illegal or troublesome in file names
    # (single pass instead of the original chain of .replace() calls).
    titleStr = re.sub(r'[|*?:\\/&;"<>]', '', titleStr)
    if len(titleStr) > 255:
        titleStr = titleStr[0:255]  # keep within filesystem name limits
    print(' >>> 解析>' + titleStr)
    getImgList(subPageUrls, titleStr)
def main():  # read config, collect gallery URLs interactively, back them up, download
    """Entry point: load the config file, read gallery front-page URLs from
    the user until a blank line, append them to url.txt as a backup, then
    download each gallery in turn.
    """
    print('\nExhentai Downloader >>> 开发者:Suni_ing\n免责声明:\n本脚本提供的内容仅用于个人学习python、研究python或欣赏python,以及其他非商业性或非盈利性用途。\n故,因使用本脚本而产生的任何风险甚至法律责任均由使用者自己承担。\n')
    print('关于配置文件:\n第一行:Cookie\n第二行:下载路径\n第三行:最小下载延时时间[秒]\n第四行:最大下载延时时间[秒]\n')
    ReadConfig()
    urls = []
    inpStr = '>>> 请输入链接 >>> 输入空白内容结束 >>> '
    url = input(inpStr)
    while url != "":
        urls.append(url)
        url = input('>>> 链接列表共 ' + str(len(urls)) + inpStr)
    print('>>> 输入结束')
    # append the batch to url.txt as a record of what was requested
    with open("url.txt", 'a') as f:
        f.write('--->>>\n')
        for item in urls:
            f.write(item + '\n')
    total = len(urls)
    for idx, u in enumerate(urls):
        print(str(idx + 1) + '/' + str(total), end='')
        getPageList(u)
    # BUG FIX: the original printed the completion message only when NO urls
    # were entered; print it unconditionally at the end of the run instead.
    print('>>> 运行结束', end='')


if __name__ == '__main__':
    main()