Python 3 爬漫画

Python 3 爬漫画

本文章仅供大家学习与交流,严禁用于商业用途,请于24小时内删除下载内容

步骤

  1. 找到图片链接

  2. 确定图片的位置

  3. 观察js的执行过程

  4. 获取js的返回值

  5. 还原图片链接

  6. 获取第一回漫画链接

  7. 循环下载下一回漫画直到最后

分析

F12 + F5 点 network 再点 Img找到要爬的图片地址,再点Elements找到图片标签的id

找到要爬的图片
view-source:没有图片链接,发现一段混淆的js,复制到Console执行
在这里插入图片描述
提示没有splic方法,原网页Console执行查看复制内容

"".splic

在这里插入图片描述
Console添加splic方法后再执行刚才混淆的代码,又出错

VM55:1 Uncaught ReferenceError: SMH is not defined at eval (eval at <anonymous> (:1:27), <anonymous>:1:1) at eval (<anonymous>) at <anonymous>:1:27,
这时只需去掉window["\x65\x76\x61\x6c"]执行,对比一下图片地址没错,已经看到图片地址的分片了.

var LZString=(function(){var f=String.fromCharCode;var keyStrBase64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var baseReverseDic={};function getBaseValue(alphabet,character){if(!baseReverseDic[alphabet]){baseReverseDic[alphabet]={};for(var i=0;i<alphabet.length;i++){baseReverseDic[alphabet][alphabet.charAt(i)]=i}}return baseReverseDic[alphabet][character]}var LZString={decompressFromBase64:function(input){if(input==null)return"";if(input=="")return null;return LZString._0(input.length,32,function(index){return getBaseValue(keyStrBase64,input.charAt(index))})},_0:function(length,resetValue,getNextValue){var dictionary=[],next,enlargeIn=4,dictSize=4,numBits=3,entry="",result=[],i,w,bits,resb,maxpower,power,c,data={val:getNextValue(0),position:resetValue,index:1};for(i=0;i<3;i+=1){dictionary[i]=i}bits=0;maxpower=Math.pow(2,2);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(next=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 2:return""}dictionary[3]=c;w=c;result.push(c);while(true){if(data.index>length){return""}bits=0;maxpower=Math.pow(2,numBits);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(c=bits){case 
0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 2:return result.join('')}if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}if(dictionary[c]){entry=dictionary[c]}else{if(c===dictSize){entry=w+w.charAt(0)}else{return null}}result.push(entry);dictionary[dictSize++]=w+entry.charAt(0);enlargeIn--;w=entry;if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}}}};return LZString})();String.prototype.splic=function(f){return LZString.decompressFromBase64(this).split(f)};

在这里插入图片描述

编码

开始编写代码请安装导入
第三方依赖requests js2py lxml ,并且导入自带的re, pathlib, json

pip install requests js2py lxml

利用 requests 请求源网页内容,再用正则提取混淆的 js 代码,之后用 js2py 的 eval_js 执行,
返回的结果(是js代码最后一个表达式的值)切片得到 json 字符串,然后用 json.loads 反序列化为dict对象

	def get_imgData(self):
		"""Return the chapter metadata dict decoded from the page's obfuscated JS.

		Fetches the chapter page, extracts the packer IIFE with a regex,
		evaluates it with js2py after prepending the LZString shim, and slices
		off the 12-char wrapper around the JSON payload (presumably the
		``SMH.imgData(...)`` call — confirm against the live page).
		"""
		r = requests.get(self.url, headers=self.headers)
		script = re.search('(\(function.*?)</script>', r.text).group(1)
		#print('js内容',script)
		jsonStr = eval_js(self.LzString+script)[12:-12]
		#print('json字符串',jsonStr)
		imgData = loads(jsonStr)#deserialize the JSON string into a dict
		return imgData

根据imgData字典还原原图片链接,同时生成下载路径

	def get_imgUrl(self):
		"""Yield a (download path, image URL) pair for every page of the chapter."""
		urlBase = "https://us.hamreus.com" + self.imgData['path']
		# Every page URL carries the chapter id and an md5 token as query params.
		info = "?cid=%d&md5="%(self.imgData['cid']) + self.imgData['sl']['md5']
		for i, f in enumerate(self.imgData['files']):
			page_num = i + 1
			downPath = self.basePath + '\\%02d-'%page_num + f
			imgUrl = urlBase + f + info
			yield (downPath,imgUrl)#tuples are produced lazily by this generator

用requests下载图片,pathlib.Path递归创建目录并且判断要下载的文件是否已经存在

	def downImg(self):
		"""Download every page image of the chapter; return the target folder path.

		Returns a notice string instead when the site reports zero pages
		(region-blocked content). Already-downloaded files are skipped.
		"""
		length = self.imgData['len']
		if length == 0: 
			return '根据相关法律法规,该漫画无法下载'
		Path(self.basePath).mkdir(parents=True, exist_ok=True)#create dirs recursively; no error if they exist
		i = 0
		print('\n开始下载', self.imgData['cname'])
		for path,url in self.imgNameUrl:#consume (path, url) tuples from the generator
			i += 1
			print('...%.2f%%'%(100*i/length), end='', flush=True)
			if Path(path).exists(): continue #skip files that were already downloaded
			res = requests.get(url, headers=self.headers)
			if res.status_code == 200:
				with open(path,'wb') as file:
					file.write(res.content)
			else:
				print(res.text,'Download Failure:',url)
		return self.basePath

直接浏览器访问图片,提示403 Forbidden,对比一开始获取图片链接的request headers 发现少了Referer,加上后requests正常访问
在这里插入图片描述
用lxml.html.fromstring解析网页,获取第一回漫画地址,然后根据imgData获取下一回漫画地址nextId,直到最后一话.

def main(url,headers):
	r"""Download all chapters of a comic given its index-page URL.

	Starting from the "start reading" link, downloads chapter after chapter by
	following the ``nextId`` field in each chapter's metadata until it is 0.

	NOTE(review): the regex is a plain (non-raw) string, so ``\.``, ``\d`` and
	``\/`` are invalid string escapes (DeprecationWarning on modern Python) —
	prefer a raw string literal.
	"""
	if re.match('https://www\.manhuagui\.com/comic/\d+\/$',url):
		res = requests.get(url, headers=headers)
		doc=html.fromstring(res.text)
		href = doc.cssselect('a.btn-read')[0].get('href')#"start reading" button links to the first chapter
		volUrl = 'https://www.manhuagui.com'+ href
		first = Volume(volUrl,headers)
		print('准备下载', first.imgData['bname'])
		first.downImg()
		nextId = first.imgData['nextId']#id of the next chapter; 0 means this was the last one
		while nextId != 0:
			# Next chapter URL = current URL with the chapter id swapped in.
			volUrl = volUrl.replace(str(first.imgData['cid']) + '.', str(nextId) + '.')
			first = Volume(volUrl,headers)
			first.downImg()
			nextId = first.imgData['nextId']
		print('\n文件保存在',first.downImg())

至此分析完毕
下面给出完整代码

import requests,re
from lxml import html
from js2py import eval_js
from json import loads
from pathlib import Path
'''
第三方依赖requests js2py lxml
pip install requests js2py lxml
'''
class Volume:	
	LzString = '''
	var LZString=(function(){var f=String.fromCharCode;var keyStrBase64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var baseReverseDic={};function getBaseValue(alphabet,character){if(!baseReverseDic[alphabet]){baseReverseDic[alphabet]={};for(var i=0;i
	<alphabet.length;i++){baseReverseDic[alphabet][alphabet.charAt(i)]=i}}return baseReverseDic[alphabet][character]}var LZString={decompressFromBase64:function(input){if(input==null)return"";if(input=="")return null;return LZString._0(input.length,32,function(index){return getBaseValue(keyStrBase64,input.charAt(index))})},_0:function(length,resetValue,getNextValue){var dictionary=[],next,enlargeIn=4,dictSize=4,numBits=3,entry="",result=[],i,w,bits,resb,maxpower,power,c,data={val:getNextValue(0),position:resetValue,index:1};for(i=0;i
	<3;i+=1){dictionary[i]=i}bits=0;maxpower=Math.pow(2,2);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(next=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 2:return""}dictionary[3]=c;w=c;result.push(c);while(true){if(data.index>length){return""}bits=0;maxpower=Math.pow(2,numBits);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(c=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 2:return result.join('')}if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}if(dictionary[c]){entry=dictionary[c]}else{if(c===dictSize){entry=w+w.charAt(0)}else{return 
null}}result.push(entry);dictionary[dictSize++]=w+entry.charAt(0);enlargeIn--;w=entry;if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}}}};return LZString})();
	String.prototype.splic=function(f){return LZString.decompressFromBase64(this).split(f)};
	'''
	
	
	
	def __init__(self,url,headers):
		self.url = url
		self.headers = headers
		self.imgData = self.get_imgData()
		self.basePath = 'F:\\Download\\' + self.imgData['bname'] + '\\' + self.imgData['cname'] 
		self.imgNameUrl = self.get_imgUrl()
	
	def get_imgData(self):
		r = requests.get(self.url, headers=self.headers)
		#search第一个匹配的结果
		script = re.search('(\(function.*?)</script>', r.text).group(1)
		#print('js内容',script)
		jsonStr = eval_js(self.LzString+script)[12:-12]
		#print('json字符串',jsonStr)
		imgData = loads(jsonStr)#json字符串反序列化为dict对象
		return imgData
	
	def get_imgUrl(self):
		urlBase = "https://us.hamreus.com" + self.imgData['path']
		info = "?cid=%d&md5="%(self.imgData['cid']) + self.imgData['sl']['md5']
		for i, f in enumerate(self.imgData['files']):
			page_num = i + 1
			downPath = self.basePath + '\\%02d-'%page_num + f
			imgUrl = urlBase + f + info
			yield (downPath,imgUrl)#元组放到生成器对象

		
	def downImg(self):
		length = self.imgData['len']
		if length == 0: 
			return '根据相关法律法规,该漫画无法下载'
		Path(self.basePath).mkdir(parents=True, exist_ok=True)#递归创建目录,存在不创建
		i = 0
		print('\n开始下载', self.imgData['cname'])
		for path,url in self.imgNameUrl:#取出生成器的元组
			i += 1
			print('...%.2f%%'%(100*i/length), end='', flush=True)
			if Path(path).exists(): continue #检查文件是否下载了
			res = requests.get(url, headers=self.headers)
			if res.status_code == 200:
				with open(path,'wb') as file:
					file.write(res.content)
			else:
				print(res.text,'Download Failure:',url)
		return self.basePath

def main(url,headers):
	"""Download a whole comic or a single chapter from manhuagui.com.

	url     -- either a comic index URL (https://www.manhuagui.com/comic/<id>/),
	           which downloads every chapter, or a single chapter URL
	           (.../comic/<id>/<chapter>.html / ..._p<n>.html).
	headers -- request headers; must carry a Referer to avoid 403 (see the
	           module-level dict).
	"""
	# Raw strings so \d, \. are regex escapes, not (deprecated) string escapes.
	if re.match(r'https://www\.manhuagui\.com/comic/\d+/$', url):
		res = requests.get(url, headers=headers)
		doc = html.fromstring(res.text)
		# The "start reading" button links to the first chapter.
		href = doc.cssselect('a.btn-read')[0].get('href')
		volUrl = 'https://www.manhuagui.com'+ href
		first = Volume(volUrl,headers)
		print('准备下载', first.imgData['bname'])
		saved = first.downImg()
		nextId = first.imgData['nextId']  # id of the next chapter; 0 means this was the last
		while nextId != 0:
			# Next chapter URL = current URL with the chapter id swapped in.
			volUrl = volUrl.replace(str(first.imgData['cid']) + '.', str(nextId) + '.')
			first = Volume(volUrl,headers)
			saved = first.downImg()
			nextId = first.imgData['nextId']
		# Reuse the last downImg() result instead of calling it again (the
		# original re-ran the whole download pass just to obtain the path).
		print('\n文件保存在', saved)
	elif re.match(r'https://www\.manhuagui\.com/comic/\d+/\d+(_p\d+)?\.html$', url):
		first = Volume(url,headers)
		print('准备下载', first.imgData['bname'])
		print('\n文件保存在',first.downImg())
	else:
		print('请输入正常的连接,\n例如:https://www.manhuagui.com/comic/17023/\n或者 https://www.manhuagui.com/comic/17023/183825.html')
# Request headers shared by every HTTP call; the Referer entry works around
# the image host's 403 Forbidden hot-link protection.
headers = {
	'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
	'Referer': 'https://www.manhuagui.com',  # avoids 403 Forbidden
}

if __name__ == '__main__':
	# Keep prompting for comic index pages until the user hits Ctrl+C.
	while True:
		comic_uri = input('\n请输入manhuagui.com漫画索引页地址[ctrl + c停止]: ')
		main(comic_uri, headers)
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值