Scraping 动漫之家 (dmzj) with Python to download comics

#!/usr/bin/python3
# -*- coding: utf-8 -*-


import requests  # send HTTP requests
from bs4 import BeautifulSoup  # parse HTML
import os  # create directories
import urllib  # URL handling
from urllib import parse  # URL parsing
import json  # JSON -> dict (not actually used below)
import re  # regular expressions (not actually used below)
import jsbeautifier  # un-minify obfuscated JS (only needed if the beautify call below is uncommented)
import js2py  # execute JS code
import time

class p_dmzj(object):
    # Initialization
    def __init__(self, url, save_path, headers, keywords):
        self.url = url
        self.save_path = save_path
        self.headers = headers
        self.keywords = keywords
        self.page = 1
        self.comic_url = []  # comic links
        self.scheme = 'https:'
        self.is_cover = False  # overwrite existing files: True = overwrite, False = skip

    # Create the directory if it does not exist
    def create_folder(self, path):
        if not os.path.isdir(path):
            os.makedirs(path)

    # Extract the digits from a string
    def find_str_num(self, m_str):
        n_str = ''
        for i in m_str:
            if i.isdecimal():
                n_str += i
        return n_str

    # Find the comic's chapter-list URL via the search API
    def get_link_by_search(self, url):
        response = requests.get(url=url, headers=self.headers)  # send the request

        # The search API returns a JS snippet that assigns g_search_data;
        # wrap it in a function and evaluate it with js2py to get the data back in Python.
        js_str = response.text
        js_str = '''
            function fun(){
                %s
                ;
                return g_search_data;
            }
        ''' % js_str

        g_search_data = js2py.eval_js(js_str)  # eval_js turns the JS function into a Python callable
        comic_list = list(g_search_data())

        for i in comic_list:
            if i['comic_name'] == self.keywords:
                return i['comic_url'] if i['comic_url'].find(self.scheme) > -1 else self.scheme + i['comic_url']
        raise Exception('comic chapter-list URL not found')

    # Normalize a chapter href into an absolute URL.
    # parsed is the urlparse() result of the chapter-list page; href is the raw link.
    def deal_url(self, parsed, href):
        if href.find(self.scheme) > -1:  # already absolute (https://...)
            return href

        if href.find('//') > -1:  # protocol-relative (//host/path)
            return self.scheme + href

        if href.find('/') == 0:  # root-relative (/path)
            return parsed.scheme + '://' + parsed.netloc + href
        elif href.find('../') == 0:  # parent-relative (../path)
            href = href[2:]
            path = parsed.path[0:parsed.path.rfind('/')]
            return parsed.scheme + '://' + parsed.netloc + path + href
        else:
            if href.find('./') == 0:  # current-directory-relative (./path)
                href = href[2:]
            return parsed.scheme + '://' + parsed.netloc + parsed.path + '/' + href

    # Visit the chapter-list page and collect every chapter (.shtml) link
    def get_comic_link(self, url):
        response = requests.get(url=url, headers=self.headers)  # send the request
        soup = BeautifulSoup(response.text, "html.parser")  # parse the HTML

        url_list = soup.find('div', class_='cartoon_online_border').find_all('a')
        self.headers['Referer'] = url  # chapter pages expect the chapter list as Referer

        parsed = urllib.parse.urlparse(url)

        links = set()
        for i in url_list:
            links.add(self.deal_url(parsed, i['href']))

        return links

    # Download a single file
    def down_file(self, url, headers, file_path):
        if not self.is_cover and os.path.isfile(file_path):
            return  # skip files that already exist unless overwriting is enabled
        time.sleep(2)  # throttle requests
        response = requests.get(url=url, headers=headers)  # send the request
        self.create_folder(file_path[:file_path.rfind('/')])  # make sure the target directory exists

        with open(file_path, "wb") as f:
            f.write(response.content)
            print('Downloaded: %s' % url)

    # Visit each chapter page and download its images
    def get_img(self, comic_html_link):
        img_prefix = self.scheme + "//images.dmzj.com/"

        for i in comic_html_link:
            response = requests.get(url=i, headers=self.headers)  # send the request
            soup = BeautifulSoup(response.text, "html.parser")  # parse the HTML

            chapter = soup.find('span', class_='redhotl').text  # chapter title
            # chapter = self.find_str_num(chapter)
            js_str = soup.find('script').text
            # js_str = jsbeautifier.beautify(js_str)

            # The chapter page embeds a JS snippet that assigns arr_pages (the image paths);
            # wrap it in a function and evaluate it with js2py, as in get_link_by_search.
            js_str = '''
                function fun(){
                    %s
                    ;
                    return arr_pages;
                }
            ''' % js_str
            get_arr_pages = js2py.eval_js(js_str)
            arr_pages = list(get_arr_pages())

            headers = dict(self.headers)  # copy so the shared headers are not mutated
            headers['Referer'] = i  # the image server checks the Referer
            for j in arr_pages:
                file_path = self.save_path + self.keywords + '/' + chapter + j[j.rfind('/'):]  # target file path
                self.down_file(img_prefix + j, headers, file_path)


    # Main flow
    def main(self):
        comic_url_raw = self.get_link_by_search(self.url)  # chapter-list URL
        comic_html_link = self.get_comic_link(comic_url_raw)  # chapter page links
        self.get_img(comic_html_link)  # visit each chapter page and download the images


if __name__ == '__main__':
    keywords = '一拳超人'  # One-Punch Man
    url = 'https://sacg.dmzj.com/comicsum/search.php?s='+urllib.parse.quote(keywords)
    # print(url)

    save_path = 'C:/Users/Administrator/Desktop/tmp/'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6788.400 QQBrowser/10.3.2864.400',
        'Referer': 'https://manhua.dmzj.com/mxwbt/'
    }

    p_dmzj(url, save_path, headers, keywords).main()
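
Both get_link_by_search and get_img rely on the same trick: the server responds with a JavaScript snippet that assigns a variable (g_search_data for the search API, arr_pages inside the chapter page), so the script wraps that snippet in a function which returns the variable and evaluates it with js2py. A minimal, self-contained sketch of the pattern, with a made-up JS snippet standing in for the real response:

import js2py

js_snippet = "var arr_pages = ['ch01/001.jpg', 'ch01/002.jpg'];"  # hypothetical server response
wrapped = '''
    function fun(){
        %s
        ;
        return arr_pages;
    }
''' % js_snippet

get_arr_pages = js2py.eval_js(wrapped)  # eval_js turns the JS function into a Python callable
print(list(get_arr_pages()))            # ['ch01/001.jpg', 'ch01/002.jpg']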
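
The hand-rolled deal_url method normalizes absolute, protocol-relative, root-relative, ../ and ./ links. The standard library's urllib.parse.urljoin covers the same cases and could serve as a simpler alternative; a minimal sketch (the base URL and href below are hypothetical examples):

from urllib.parse import urljoin

def resolve(base_url, href):
    # urljoin resolves '//host/x', '/x', '../x' and './x' forms against base_url
    return urljoin(base_url, href)

# e.g. resolve('https://manhua.dmzj.com/yiquanchaoren/', '/yiquanchaoren/1234.shtml')
#      -> 'https://manhua.dmzj.com/yiquanchaoren/1234.shtml'

Note that urljoin treats a bare relative link as relative to the last path segment, so its result can differ slightly from deal_url for links without a leading '/', './' or '../'.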

 
