python教程it教程网_Python教程 | 里站爬虫

最新推荐文章于 2024-04-03 21:20:27 发布

weixin_39517202

最新推荐文章于 2024-04-03 21:20:27 发布

阅读量202

点赞数

文章标签： python教程it教程网

本文链接：https://blog.csdn.net/weixin_39517202/article/details/111450587

版权

声明：

1.本教程及其相关派生物仅用于个人学习python、研究python或欣赏python，以及其他非商业性或非盈利性用途。故，因使用本教程及其相关派生物而产生的任何风险甚至法律责任均由使用者自己承担。

2.本教程不提供相关网站连接方法。

3.初学Python，不足之处还望各位大佬指出。

运行环境：Python 3；所需第三方库（requests、requests_toolbelt、fake_useragent、chardet）请自行安装。

配置文件：

第一行:Cookie

第二行:下载路径

第三行:最小下载延时时间[秒]

第四行:最大下载延时时间[秒]

原理：建立链接列表，设置超时，正则表达式逐步匹配

代码：

from requests.adapters import HTTPAdapter

from fake_useragent import UserAgent

from requests_toolbelt import SSLAdapter

import os,time,random,re

import requests,ssl,chardet

# Version: 1.0
# This script is meant for downloading comics; a gallery with an extremely
# large number of pages may not be downloaded completely.
# An interrupted download can be resumed where it stopped. For a multi-page
# comic only the link to the gallery's first page has to be entered.

# Source of the random User-Agent strings sent with the HTTP requests below.
ua = UserAgent()

# Settings loaded from the configuration file by ReadConfig(): the Cookie
# header value, the download directory, and the minimum / maximum delay
# (in seconds) between downloads. All are unset until ReadConfig() runs.
Cookies = TargetPath = MinTime = MaxTime = None

# Prefix of a gallery URL on the target site.
urlHead = 'https://exhentai.org/g/'

def ReadConfig():  # read the configuration file
    """Prompt for the configuration file and load its four settings.

    The file must contain at least four lines:
      1. the Cookie header value
      2. the download directory
      3. the minimum delay between downloads, in whole seconds
      4. the maximum delay between downloads, in whole seconds

    The prompt is repeated until a file is given that exists, can be read and
    has the layout above. On success the module globals ``Cookies``,
    ``TargetPath``, ``MinTime`` and ``MaxTime`` are assigned; nothing is
    assigned from a file that fails validation.
    """
    global Cookies, TargetPath, MinTime, MaxTime

    # Loop rather than recurse so repeated bad input cannot exhaust the stack.
    while True:
        path = input('配置文件绝对路径 [也可以拖进来]:')
        # A path dragged into a terminal may arrive wrapped in quotes and/or
        # padded with whitespace; remove both before touching the filesystem.
        path = path.strip().strip('"\'')
        if not os.path.exists(path):
            continue

        try:
            with open(path, "r") as fileHandler:
                listOfLines = fileHandler.read().splitlines()
            # Parse everything into locals first so a malformed file never
            # leaves the globals half-updated.
            cookies = listOfLines[0]
            targetPath = listOfLines[1]
            minTime = int(listOfLines[2])
            maxTime = int(listOfLines[3])
        except (OSError, IndexError, ValueError) as e:
            # OSError: unreadable file; IndexError: fewer than four lines;
            # ValueError: non-integer delay or undecodable bytes.
            print('配置文件无效: ' + str(e))
            continue

        Cookies, TargetPath, MinTime, MaxTime = cookies, targetPath, minTime, maxTime
        return

# Download one image: fetch the page at Url, take the image address out of its
# HTML, wait a random delay, then write the downloaded bytes to dlPath.
# Reads the module globals ua, Cookies, MinTime and MaxTime.
# NOTE(review): leading indentation is missing from this listing and must be
# restored before the function can run.
def DlComic(Url,dlPath):#parse the page link, then download the image

global ua,Cookies,MinTime,MaxTime

# Session used for the HTML page that embeds the image.
DlPage = requests.Session()

DlPage.mount('https://', HTTPAdapter(max_retries=5))

# NOTE(review): this mounts a second adapter on the same 'https://' prefix;
# presumably it replaces the retry adapter mounted just above, so
# max_retries=5 may not take effect -- verify against the requests docs.
DlPage.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2))

# Request the page with the configured Cookie and a random User-Agent
# (5 second timeout) and keep the raw response bytes.
response = DlPage.get(Url,headers = {'Cookie':Cookies,'User-Agent':ua.random},timeout = 5).content

# Detect the page encoding from the bytes, then decode to text.
encode_type = chardet.detect(response)

pageStr = response.decode(encode_type['encoding'])

# NOTE(review): the next statement is truncated in this listing -- the regex
# pattern and the rest of the call are missing (most likely HTML markup inside
# the pattern was stripped when the page was extracted). It has to be restored
# from the original script; imgTUrl is expected to end up holding the image URL
# that is requested further down.
imgTUrl =str(re.findall(r'(?<=

time.sleep(random.uniform(MinTime,MaxTime)) #delay

# Separate session for the image itself: no Cookie header, 120 second timeout.
DlImg = requests.Session()

DlImg.mount('https://', HTTPAdapter(max_retries=5))

DlImg.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2))

response = DlImg.get(imgTUrl,headers = {'User-Agent':ua.random},timeout = 120)

# Write the image bytes to the target file.
with open(dlPath, 'wb') as f:

f.write(response.content)

f.flush()

print('succeed')

def getImgList(Urls, title):  # collect the image-page links, then download them
    """Collect every image-page link of a gallery and download the images.

    Args:
        Urls: URLs of the gallery index pages to scan for image-page links.
        title: name of the sub-directory, created under the global
            ``TargetPath``, that receives the images.

    Images are saved as ``0.jpg``, ``1.jpg``, ... in the order their links
    were found. A file that already exists is skipped, which is what lets an
    interrupted run be resumed. A failed download is retried up to 10 times;
    each failure is printed and the loop then moves on to the next image.
    """
    link_pattern = re.compile(r'https\:\/\/exhentai\.org\/s\/\w{1,}\/[\w\-]{1,}')
    imgList = []

    for Url in Urls:
        # The context manager closes the session's connections after each page.
        with requests.Session() as DlPage:
            # Mount a single adapter: a second mount on the same prefix would
            # replace the first, so the retry count is passed to the TLS
            # adapter instead of to a separate HTTPAdapter.
            DlPage.mount('https://', SSLAdapter(ssl.PROTOCOL_TLSv1_2, max_retries=5))
            page = DlPage.get(
                Url,
                headers={'Cookie': Cookies, 'User-Agent': ua.random},
                timeout=5,
            ).content
        # chardet reports None when it cannot decide (e.g. an empty body);
        # fall back to UTF-8 and tolerate stray bytes, since only ASCII URLs
        # are extracted from the text.
        encoding = chardet.detect(page)['encoding'] or 'utf-8'
        pageStr = page.decode(encoding, errors='replace')
        imgList.extend(link_pattern.findall(pageStr))

    Numl = len(imgList)
    print('共' + str(Numl) + '页')

    # os.path.join / makedirs instead of "+ '\\'" and mkdir: works with or
    # without a trailing separator in TargetPath, on any platform, and also
    # when parent directories do not exist yet.
    dlDir = os.path.join(TargetPath, title)
    os.makedirs(dlDir, exist_ok=True)

    for item, imgUrl in enumerate(imgList):
        dlPath = os.path.join(dlDir, str(item) + '.jpg')
        print(' ' + str(item + 1) + "/" + str(Numl) + " >>> " + imgUrl, ' >>> ', end='')

        if os.path.exists(dlPath):
            print('done')
            continue

        try:
            DlComic(imgUrl, dlPath)
            continue
        except Exception as e:
            print('Error' + str(e))

        # First attempt failed: retry a bounded number of times.
        for i in range(10):
            print(' ' + str(item + 1) + " ReDl >>> " + str(i) + ' >>> ', end='')
            try:
                DlComic(imgUrl, dlPath)
                break
            except Exception as e:
                print('Error' + str(e))

# Parse a gallery's first page: collect the links to its index pages, extract
# and sanitise the gallery title, then hand both to getImgList().
# A Url that does not contain 'https://exhentai.org/g/' is rejected with a message.
# NOTE(review): leading indentation is missing from this listing and must be
# restored before the function can run.
def getPageList(Url):#parse the comic's first page and collect links to 'all' of its pages

global ua,Cookies

# find() accepts the prefix anywhere inside Url, not only at its start.
if (Url.find('https://exhentai.org/g/') != -1):

DlPage = requests.Session()

DlPage.mount('https://', HTTPAdapter(max_retries=5))

# NOTE(review): unlike the other requests in this file, this one passes no
# timeout -- confirm whether that is intentional.
page = DlPage.get(Url,headers = {'Cookie':Cookies,'User-Agent':ua.random}).content

# Detect the page encoding from the bytes, then decode to text.
encode_type = chardet.detect(page)#2

page = page.decode(encode_type['encoding'])

# First pass: match the pager anchors (index-page URL followed by the
# onclick attribute and the page number).
# NOTE(review): the first letter of 'οnclick' in this pattern appears to be a
# non-ASCII look-alike (Greek omicron) rather than a Latin 'o'; if so, the
# pattern cannot match a real 'onclick' attribute -- verify and restore.
subPageUrls = re.findall(r'https\:\/\/exhentai\.org\/g\/\w{1,}\/[\w\-]{1,}\/\?p=\d{1,}\" οnclick=\"return false\">\d{1,}',page)

# Second pass: keep only the URL part of each first-pass match.
subPageUrls = re.findall(r'https\:\/\/exhentai\.org\/g\/\w{1,}\/[\w\-]{1,}\/\?p=\d{1,}', "','".join(subPageUrls))

# Keep the first half of the matches -- presumably because the pager is
# rendered twice on the page; TODO confirm.
subPageUrls = subPageUrls[0:int(len(subPageUrls)/2)]

# The first page itself goes to the front of the list.
subPageUrls.insert(0,Url)

# NOTE(review): the next statement is truncated in this listing -- the
# look-behind / look-ahead contents of the title pattern are missing (most
# likely HTML tags inside the pattern were stripped when the page was
# extracted) and the literal is split across lines. It has to be restored from
# the original script.
titleStr = str(re.findall(r'(?<=

).*?(?=)',page)[0])

# Remove characters from the title before it is used as a directory name.
# NOTE(review): replace("","") is a no-op as written; presumably it removed a
# character that was lost in extraction -- verify.
titleStr = titleStr.replace("|","").replace("*","").replace("?","").replace("","").replace(":","").replace("\\","").replace("/","").replace("&","").replace(";","")

# Cap the title at 255 characters.
if len(titleStr)>255:

titleStr = titleStr[0:255]

print(' >>> 解析>' + titleStr)

getImgList(subPageUrls,titleStr)

else:

print('>>> 错误的里站链接 '+Url)

def main():  # read the config, collect and back up the gallery links, download them
    """Run the interactive downloader.

    Steps:
      1. Print the banner and the configuration-file layout.
      2. Load the configuration via ``ReadConfig()``.
      3. Read gallery links from the user until a blank line is entered.
      4. Append the entered links to ``url.txt`` in the working directory as
         a backup, preceded by a ``--->>>`` separator line.
      5. Process each link with ``getPageList()``; an error in one gallery is
         printed and does not stop the remaining ones.
      6. Print the end-of-run message.
    """
    print('\nExhentai Downloader >>> 开发者:Suni_ing\n免责声明:\n本脚本提供的内容仅用于个人学习python、研究python或欣赏python，以及其他非商业性或非盈利性用途。\n故，因使用本脚本而产生的任何风险甚至法律责任均由使用者自己承担。\n')
    print('关于配置文件:\n第一行:Cookie\n第二行:下载路径\n第三行:最小下载延时时间[秒]\n第四行:最大下载延时时间[秒]\n')

    ReadConfig()

    urls = []
    inpStr = '>>> 请输入链接 >>> 输入空白内容结束 >>> '
    # Strip the input so that a whitespace-only line counts as blank, as the
    # prompt promises, and pasted links carry no stray spaces.
    url = input(inpStr).strip()
    while url:
        urls.append(url)
        url = input('>>> 链接列表共 ' + str(len(urls)) + inpStr).strip()
    print('>>> 输入结束')

    # Back up the links before downloading; the context manager guarantees the
    # file is closed even if a write fails.
    with open("url.txt", 'a', encoding='utf-8') as f:
        f.write('--->>>\n')
        f.writelines(item + '\n' for item in urls)

    total = len(urls)
    for index, item in enumerate(urls, start=1):
        print(str(index) + '/' + str(total), end='')
        try:
            getPageList(item)
        except Exception as e:
            # Top-level boundary: report the failure and go on to the next link.
            print('Error' + str(e))

    # Printed whether or not any link was entered, so the user always sees
    # that the run is over.
    print('>>> 运行结束', end='')

# Start the downloader only when this file is executed as a script, so that
# importing it as a module does not trigger the interactive prompts.
if __name__ == "__main__":
    main()

weixin_39517202

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫