Python 3 爬漫画
本文章仅供大家学习与交流,严禁用于商业用途,请于24小时内删除下载内容
步骤
-
找到图片链接
-
确定图片的位置
-
观察js的执行过程
-
获取js的返回值
-
还原图片链接
-
获取第一回漫画链接
-
循环下载下一回漫画直到最后
分析
F12 + F5 点 network 再点 Img找到要爬的图片地址,再点Elements找到图片标签的id
view-source:没有图片链接,发现一段混淆的js,复制到Console执行
提示没有splic方法,原网页Console执行查看复制内容
"".splic
Console添加splic方法后再执行刚才混淆的代码,又出错
VM55:1 Uncaught ReferenceError: SMH is not defined at eval (eval at <anonymous> (:1:27), <anonymous>:1:1) at eval (<anonymous>) at <anonymous>:1:27
,
这时只需去掉window["\x65\x76\x61\x6c"]
执行,对比一下图片地址没错,已经看到图片地址的分片了.
// LZ-string Base64 decompressor (minified, exactly as served by the target site).
// The final statement adds the String.prototype.splic helper that the obfuscated
// page script calls to split the decompressed image file list.
var LZString=(function(){var f=String.fromCharCode;var keyStrBase64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var baseReverseDic={};function getBaseValue(alphabet,character){if(!baseReverseDic[alphabet]){baseReverseDic[alphabet]={};for(var i=0;i<alphabet.length;i++){baseReverseDic[alphabet][alphabet.charAt(i)]=i}}return baseReverseDic[alphabet][character]}var LZString={decompressFromBase64:function(input){if(input==null)return"";if(input=="")return null;return LZString._0(input.length,32,function(index){return getBaseValue(keyStrBase64,input.charAt(index))})},_0:function(length,resetValue,getNextValue){var dictionary=[],next,enlargeIn=4,dictSize=4,numBits=3,entry="",result=[],i,w,bits,resb,maxpower,power,c,data={val:getNextValue(0),position:resetValue,index:1};for(i=0;i<3;i+=1){dictionary[i]=i}bits=0;maxpower=Math.pow(2,2);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(next=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 2:return""}dictionary[3]=c;w=c;result.push(c);while(true){if(data.index>length){return""}bits=0;maxpower=Math.pow(2,numBits);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(c=bits){case 
0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 2:return result.join('')}if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}if(dictionary[c]){entry=dictionary[c]}else{if(c===dictSize){entry=w+w.charAt(0)}else{return null}}result.push(entry);dictionary[dictSize++]=w+entry.charAt(0);enlargeIn--;w=entry;if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}}}};return LZString})();String.prototype.splic=function(f){return LZString.decompressFromBase64(this).split(f)};
编码
开始编写代码前,请先安装第三方依赖 requests、js2py、lxml,
并导入自带的 re、pathlib、json
pip install requests js2py lxml
利用 requests 请求源网页内容,再用正则提取混淆的 js 代码,之后用 js2py 的 eval_js 执行,
返回的结果(是js代码最后一个表达式的值)切片得到 json 字符串,然后用 json.loads 反序列化为dict对象
def get_imgData(self):
    """Fetch the chapter page, execute its obfuscated JS with js2py and
    return the decoded image-metadata dict."""
    r = requests.get(self.url, headers=self.headers)
    # Raw string fixes the invalid '\(' escape of the original pattern
    # (deprecated since Python 3.6, a SyntaxError in newer versions).
    script = re.search(r'(\(function.*?)</script>', r.text).group(1)
    # eval_js returns the value of the last JS expression; slice off the
    # 12-character wrapper on each side to leave bare JSON.
    jsonStr = eval_js(self.LzString + script)[12:-12]
    imgData = loads(jsonStr)  # deserialize the JSON string into a dict
    return imgData
根据imgData字典还原原始图片链接,同时生成下载路径
def get_imgUrl(self):
    """Generator: yield one (download_path, image_url) tuple per page of
    the chapter, built from the decoded imgData metadata."""
    base = "https://us.hamreus.com" + self.imgData['path']
    query = "?cid=%d&md5=" % (self.imgData['cid']) + self.imgData['sl']['md5']
    page = 0
    for name in self.imgData['files']:
        page += 1
        local = self.basePath + '\\%02d-' % page + name
        remote = base + name + query
        yield (local, remote)
用requests下载图片,pathlib.Path递归创建目录并判断要下载的文件是否已经存在
def downImg(self):
    """Download every page image of this chapter, skipping files that
    already exist; returns the directory the files were saved to, or a
    notice string when the chapter reports zero pages."""
    length = self.imgData['len']
    if length == 0:
        return '根据相关法律法规,该漫画无法下载'
    Path(self.basePath).mkdir(parents=True, exist_ok=True)  # create the directory tree recursively
    i = 0
    print('\n开始下载', self.imgData['cname'])
    for path,url in self.imgNameUrl:  # (local path, image url) tuples from the generator
        i += 1
        print('...%.2f%%'%(100*i/length), end='', flush=True)  # inline progress percentage
        if Path(path).exists(): continue  # skip pages already downloaded
        res = requests.get(url, headers=self.headers)
        if res.status_code == 200:
            with open(path,'wb') as file:
                file.write(res.content)
        else:
            print(res.text,'Download Failure:',url)
    return self.basePath
直接用浏览器访问图片,提示403 Forbidden;对比一开始获取图片链接的request headers,发现少了Referer,加上后requests即可正常访问
用lxml.html.fromstring解析网页,获取第一回漫画地址,然后根据imgData获取下一回漫画地址nextId,直到最后一话.
def main(url, headers):
    """Download a whole comic: resolve the first chapter from the index
    page, then follow imgData['nextId'] until it is 0 (last chapter)."""
    # Raw string fixes the invalid '\.' / '\/' escapes of the original pattern.
    if re.match(r'https://www\.manhuagui\.com/comic/\d+/$', url):
        res = requests.get(url, headers=headers)
        doc = html.fromstring(res.text)
        href = doc.cssselect('a.btn-read')[0].get('href')  # link to the first chapter
        volUrl = 'https://www.manhuagui.com' + href
        first = Volume(volUrl, headers)
        print('准备下载', first.imgData['bname'])
        savedPath = first.downImg()
        nextId = first.imgData['nextId']  # id of the next chapter; 0 means last
        while nextId != 0:
            volUrl = volUrl.replace(str(first.imgData['cid']) + '.', str(nextId) + '.')
            first = Volume(volUrl, headers)
            savedPath = first.downImg()
            nextId = first.imgData['nextId']
        # Bug fix: the original called first.downImg() a second time here just to
        # get the path, re-printing the header against an exhausted generator.
        print('\n文件保存在', savedPath)
至此分析完毕
下面给出完整代码
import requests,re
from lxml import html
from js2py import eval_js
from json import loads
from pathlib import Path
'''
第三方依赖requests js2py lxml
pip install requests js2py lxml
'''
class Volume:
    """One chapter ("volume") of a manhuagui.com comic.

    Fetches the chapter page, decodes the obfuscated in-page JS to recover
    the image metadata (imgData), rebuilds the real image URLs and
    downloads every page image to a local directory.
    """
    # LZ-string Base64 decompressor copied from the target site, plus the
    # String.prototype.splic helper the obfuscated page script expects.
    # This JS text is executed verbatim by js2py, so it is kept exactly as
    # the site ships it (flush-left inside the triple-quoted string).
    LzString = '''
var LZString=(function(){var f=String.fromCharCode;var keyStrBase64="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/=";var baseReverseDic={};function getBaseValue(alphabet,character){if(!baseReverseDic[alphabet]){baseReverseDic[alphabet]={};for(var i=0;i
<alphabet.length;i++){baseReverseDic[alphabet][alphabet.charAt(i)]=i}}return baseReverseDic[alphabet][character]}var LZString={decompressFromBase64:function(input){if(input==null)return"";if(input=="")return null;return LZString._0(input.length,32,function(index){return getBaseValue(keyStrBase64,input.charAt(index))})},_0:function(length,resetValue,getNextValue){var dictionary=[],next,enlargeIn=4,dictSize=4,numBits=3,entry="",result=[],i,w,bits,resb,maxpower,power,c,data={val:getNextValue(0),position:resetValue,index:1};for(i=0;i
<3;i+=1){dictionary[i]=i}bits=0;maxpower=Math.pow(2,2);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(next=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}c=f(bits);break;case 2:return""}dictionary[3]=c;w=c;result.push(c);while(true){if(data.index>length){return""}bits=0;maxpower=Math.pow(2,numBits);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}switch(c=bits){case 0:bits=0;maxpower=Math.pow(2,8);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 1:bits=0;maxpower=Math.pow(2,16);power=1;while(power!=maxpower){resb=data.val&data.position;data.position>>=1;if(data.position==0){data.position=resetValue;data.val=getNextValue(data.index++)}bits|=(resb>0?1:0)*power;power<<=1}dictionary[dictSize++]=f(bits);c=dictSize-1;enlargeIn--;break;case 2:return result.join('')}if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}if(dictionary[c]){entry=dictionary[c]}else{if(c===dictSize){entry=w+w.charAt(0)}else{return 
null}}result.push(entry);dictionary[dictSize++]=w+entry.charAt(0);enlargeIn--;w=entry;if(enlargeIn==0){enlargeIn=Math.pow(2,numBits);numBits++}}}};return LZString})();
String.prototype.splic=function(f){return LZString.decompressFromBase64(this).split(f)};
'''

    def __init__(self, url, headers):
        """url: chapter page URL; headers: request headers — must carry a
        Referer or the image host answers 403 Forbidden."""
        self.url = url
        self.headers = headers
        self.imgData = self.get_imgData()
        # NOTE(review): hard-coded Windows download root; bname/cname come
        # from the decoded metadata (book name / chapter name).
        self.basePath = 'F:\\Download\\' + self.imgData['bname'] + '\\' + self.imgData['cname']
        self.imgNameUrl = self.get_imgUrl()

    def get_imgData(self):
        """Fetch the chapter page, run its obfuscated JS via js2py and
        return the decoded image-metadata dict."""
        r = requests.get(self.url, headers=self.headers)
        # Raw string fixes the invalid '\(' escape of the original pattern
        # (deprecated since Python 3.6, a SyntaxError in newer versions).
        script = re.search(r'(\(function.*?)</script>', r.text).group(1)
        # eval_js returns the last JS expression's value; strip the 12-char
        # wrapper on both sides to leave the bare JSON payload.
        jsonStr = eval_js(self.LzString + script)[12:-12]
        return loads(jsonStr)  # JSON string -> dict

    def get_imgUrl(self):
        """Generator: yield one (download_path, image_url) tuple per page."""
        urlBase = "https://us.hamreus.com" + self.imgData['path']
        info = "?cid=%d&md5=" % (self.imgData['cid']) + self.imgData['sl']['md5']
        for i, f in enumerate(self.imgData['files']):
            page_num = i + 1
            downPath = self.basePath + '\\%02d-' % page_num + f
            imgUrl = urlBase + f + info
            yield (downPath, imgUrl)

    def downImg(self):
        """Download every page image, skipping existing files.

        Returns the directory the files were saved to, or a notice string
        when the chapter reports zero pages."""
        length = self.imgData['len']
        if length == 0:
            return '根据相关法律法规,该漫画无法下载'
        Path(self.basePath).mkdir(parents=True, exist_ok=True)  # create dirs recursively, ok if present
        i = 0
        print('\n开始下载', self.imgData['cname'])
        for path, url in self.imgNameUrl:  # (local path, image url) tuples from the generator
            i += 1
            print('...%.2f%%' % (100*i/length), end='', flush=True)  # inline progress percentage
            if Path(path).exists(): continue  # skip pages already downloaded
            res = requests.get(url, headers=self.headers)
            if res.status_code == 200:
                with open(path, 'wb') as file:
                    file.write(res.content)
            else:
                print(res.text, 'Download Failure:', url)
        return self.basePath
def main(url, headers):
    """Entry point: a comic index URL downloads every chapter in sequence;
    a single chapter URL downloads just that chapter; anything else prints
    a usage hint."""
    # Raw strings fix the invalid '\.' / '\/' escapes of the original patterns.
    if re.match(r'https://www\.manhuagui\.com/comic/\d+/$', url):
        res = requests.get(url, headers=headers)
        doc = html.fromstring(res.text)
        href = doc.cssselect('a.btn-read')[0].get('href')  # link to the first chapter
        volUrl = 'https://www.manhuagui.com' + href
        first = Volume(volUrl, headers)
        print('准备下载', first.imgData['bname'])
        savedPath = first.downImg()
        nextId = first.imgData['nextId']  # id of the next chapter; 0 means last
        while nextId != 0:
            volUrl = volUrl.replace(str(first.imgData['cid']) + '.', str(nextId) + '.')
            first = Volume(volUrl, headers)
            savedPath = first.downImg()
            nextId = first.imgData['nextId']
        # Bug fix: the original called first.downImg() a second time here just to
        # get the path, re-printing the header against an exhausted generator.
        print('\n文件保存在', savedPath)
    else:
        if re.match(r'https://www\.manhuagui\.com/comic/\d+/\d+(_p\d+)?\.html$', url):
            first = Volume(url, headers)
            print('准备下载', first.imgData['bname'])
            print('\n文件保存在', first.downImg())
        else:
            print('请输入正常的连接,\n例如:https://www.manhuagui.com/comic/17023/\n或者 https://www.manhuagui.com/comic/17023/183825.html')
# Default request headers: a desktop Chrome user-agent plus the Referer
# header, without which the image host answers 403 Forbidden.
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36',
    'Referer': 'https://www.manhuagui.com',
}
# Script entry point: keep prompting for comic index URLs; stop with Ctrl+C.
if __name__ == '__main__':
    while True:
        comic_uri = input('\n请输入manhuagui.com漫画索引页地址[ctrl + c停止]: ')
        main(comic_uri, headers)