#!/usr/bin/python3
# -*- coding: utf-8 -*-
import requests # 发送http请求
from bs4 import BeautifulSoup # 解析html
import os # 创建目录
import urllib # 解析URL
from urllib import parse # 解析URL
import json # json 转 dict
import re # 正则匹配
import jsbeautifier # 解密混淆加密js代码
import js2py # 执行js代码
import time
class p_dmzj(object):
    """Crawler for dmzj comics.

    Pipeline: query the search API for an exact comic name, open the comic's
    directory page, collect every chapter-page link, then download every page
    image of every chapter into save_path/<keywords>/<chapter>/.
    """

    def __init__(self, url, save_path, headers, keywords):
        """
        url       -- search API URL; its response is a JS snippet defining g_search_data
        save_path -- local root directory for downloads (expected to end with '/')
        headers   -- base HTTP headers; 'Referer' is (re)set per request
        keywords  -- exact comic name to match against search results
        """
        self.url = url
        self.save_path = save_path
        self.headers = headers
        self.keywords = keywords
        self.page = 1
        self.comic_url = []      # collected comic links (kept for compatibility; unused)
        self.scheme = 'https:'
        self.is_cover = False    # True: overwrite existing files; False: skip them

    def create_folder(self, path):
        """Create directory *path* (including parents) if it does not exist."""
        if not os.path.isdir(path):
            os.makedirs(path)

    def find_str_num(self, m_str):
        """Return every decimal digit character of *m_str*, concatenated in order."""
        return ''.join(ch for ch in m_str if ch.isdecimal())

    def get_link_by_search(self, url):
        """Query the search endpoint and return the absolute directory URL of
        the comic whose name equals self.keywords.

        Raises ValueError when no search result matches exactly.
        """
        response = requests.get(url=url, headers=self.headers)
        # SECURITY NOTE: the raw response body is executed as JavaScript via
        # js2py -- this fully trusts the remote server.
        js_str = '''
        function fun(){
        %s
        ;
        return g_search_data;
        }
        ''' % response.text
        g_search_data = js2py.eval_js(js_str)
        for entry in list(g_search_data()):
            if entry['comic_name'] == self.keywords:
                link = entry['comic_url']
                # Normalize protocol-relative links to absolute https URLs.
                return link if self.scheme in link else self.scheme + link
        # Fix: was `raise BaseException(...)` -- raise a specific, catchable
        # exception instead of the root of the exception hierarchy.
        raise ValueError('没有找到漫画目录地址')

    def deal_url(self, parse, deal_url):
        """Resolve *deal_url* (absolute, protocol-relative, root-relative,
        parent-relative or document-relative) against *parse*, a
        urllib.parse.urlparse() result of the page it appeared on.
        Returns an absolute URL string.

        NOTE(review): the *parse* parameter shadows the imported urllib
        submodule of the same name; kept for backward compatibility.
        """
        if self.scheme in deal_url:           # already absolute
            return deal_url
        if '//' in deal_url:                  # protocol-relative: //host/path
            return self.scheme + deal_url
        if deal_url.startswith('/'):          # root-relative
            return parse.scheme + '://' + parse.netloc + deal_url
        if deal_url.startswith('../'):        # parent-relative
            deal_url = deal_url[2:]           # drop '..', keep the leading '/'
            path = parse.path[0:parse.path.rfind('/')]
            return parse.scheme + '://' + parse.netloc + path + deal_url
        if deal_url.startswith('./'):         # current-directory-relative
            deal_url = deal_url[2:]
        return parse.scheme + '://' + parse.netloc + parse.path + '/' + deal_url

    def get_comic_link(self, url):
        """Open the comic directory page *url* and return the set of absolute
        chapter-page (.shtml) links found in the chapter list.

        Side effect: sets self.headers['Referer'] to *url* so chapter
        requests carry the directory page as referer.
        """
        response = requests.get(url=url, headers=self.headers)
        soup = BeautifulSoup(response.text, "html.parser")
        url_list = soup.find('div', class_='cartoon_online_border').findAll('a')
        self.headers['Referer'] = url
        parse = urllib.parse.urlparse(url)
        return {self.deal_url(parse, a['href']) for a in url_list}

    def down_file(self, url, headers, file_path):
        """Download *url* to *file_path*, creating parent directories.

        Existing files are skipped unless self.is_cover is True. Non-200
        responses are skipped with a message so error pages are never
        written to disk as image files.
        """
        if not self.is_cover and os.path.isfile(file_path):
            return
        time.sleep(2)  # throttle to be gentle on the server
        response = requests.get(url=url, headers=headers)
        if response.status_code != 200:
            # Fix: previously the error body was saved as the image file.
            print('download failed (%d): %s' % (response.status_code, url))
            return
        self.create_folder(file_path[:file_path.rfind('/')])
        with open(file_path, "wb") as f:
            f.write(response.content)
        print('成功下载文件: %s' % url)

    def get_img(self, comic_html_link):
        """Visit every chapter page in *comic_html_link*, extract the page
        image list (arr_pages) from the embedded script, and download each
        image from the dmzj image host.
        """
        img_prefix = self.scheme + "//images.dmzj.com/"
        for page_url in comic_html_link:
            response = requests.get(url=page_url, headers=self.headers)
            soup = BeautifulSoup(response.text, "html.parser")
            chapter = soup.find('span', class_='redhotl').text
            # SECURITY NOTE: the page's inline script is executed via js2py,
            # which fully trusts the remote server.
            js_str = '''
            function fun(){
            %s
            ;
            return arr_pages;
            }
            ''' % soup.find('script').text
            get_arr_pages = js2py.eval_js(js_str)
            arr_pages = list(get_arr_pages())
            # Fix: copy the base headers instead of aliasing self.headers,
            # so setting 'Referer' here does not mutate the shared dict.
            headers = dict(self.headers)
            headers['Referer'] = page_url
            for img_path in arr_pages:
                file_path = (self.save_path + self.keywords + '/' + chapter
                             + img_path[img_path.rfind('/'):])
                self.down_file(img_prefix + img_path, headers, file_path)

    def main(self):
        """Run the full pipeline: search -> chapter links -> download images."""
        comic_url_raw = self.get_link_by_search(self.url)      # comic directory URL
        comic_html_link = self.get_comic_link(comic_url_raw)   # chapter page links
        self.get_img(comic_html_link)                          # download all images
if __name__ == '__main__':
    # Comic title to search for ("One-Punch Man").
    search_term = '一拳超人' # 一拳超人
    # Build the search API URL with the title percent-encoded.
    search_url = 'https://sacg.dmzj.com/comicsum/search.php?s='+urllib.parse.quote(search_term)
    # print(url)
    # Local directory the downloaded images are written under.
    target_dir = 'C:/Users/Administrator/Desktop/tmp/'
    request_headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400',
        'Referer': 'https://manhua.dmzj.com/mxwbt/'
    }
    crawler = p_dmzj(search_url, target_dir, request_headers, search_term)
    crawler.main()