Python practice project: a manga scraper, with code, a packaged executable download link, and a screenshot

# -*- coding: GBK -*-
import urllib.request
import winreg            # read the Windows registry (to locate the 360 browser)
import requests
import re
import threading         # run downloads off the UI thread
import os
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.select import Select
import tkinter as tk
from tkinter import *    # Entry, IntVar
def key360():  # look up the 360 browser location in the Windows registry
    UnInsKey360 = r'360SeSES\shell\open\command'
    key = winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, UnInsKey360)
    name, value, vtype = winreg.EnumValue(key, 0)  # value name, value data, data type
    num = re.findall(r"(.+?)360se.exe", value)
    num = num[0] + '360se.exe'
    return num
def thread_it(func, *args):
    '''Run a function in a daemon thread so it does not block the UI.'''
    t = threading.Thread(target=func, args=args)
    t.daemon = True   # daemon thread dies together with the main window
    t.start()
    # do NOT t.join() here -- joining would freeze the Tk interface

def getlink(url):  # install a urllib opener that sends a browser User-Agent
    headers = ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36")
    opener = urllib.request.build_opener()
    opener.addheaders = [headers]
    urllib.request.install_opener(opener)
def get_link(url):  # fetch one comic page and collect its chapter links
    getlink(url)  # install the User-Agent opener BEFORE the first request
    file = urllib.request.urlopen(url).read()
    file = file.decode('ANSI')  # 'ANSI' = the Windows system code page (GBK on Chinese Windows)
    # the comic title sits in: class="pic" title="..."
    title_matches = re.compile(r'class="pic" title="((.*?))"', re.S).findall(file)
    for match in title_matches:
        path = os.getcwd()
        global file_path
        file_path = path + '\\' + str(match[0])  # one folder per comic
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        else:
            print(file_path + ' directory already exists')
            continue
    global path_file1
    path_file1 = title_matches[-1][0]
    # chapter links live between class="plist pmedium max-h200" and the first </div>
    chapter_div = str(re.compile(r'class="plist pmedium max-h200".*?>(.*?)</div>', re.S).findall(file))
    links = re.compile(r'href="((.*?))"', re.S).findall(chapter_div)
    global filename
    global filename1
    filename = 'link.txt'    # work list of chapter URLs
    filename1 = 'link1.txt'  # chapters already finished
    for link in links:
        gethtml = 'http://www.pufei8.com' + link[0]
        print(gethtml)
        with open(filename, 'a') as file_object:
            file_object.write(gethtml + "\n")
def get_link2():  # run get_link for each requested comic, then read back the chapter list
    link2 = get_page()
    for i in link2:
        url = i
        get_link(url)
        with open(filename) as file_object:
            link3 = file_object.readlines()
    return link3
def liulanqi():  # drive the browser through every chapter and save each page image
    link4 = get_link2()
    __browser_url = key360()  # path of the 360 browser executable
    chrome_options = Options()
    chrome_options.binary_location = __browser_url
    path1 = os.getcwd()
    path = os.path.join(path1, r"78.0.39.4.108\chromedriver.exe")  # full path to chromedriver
    # Important: hide the "enable-automation" switch so sites are less likely to
    # detect Selenium; this must be set BEFORE the driver is created to take effect
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    global driver
    driver = webdriver.Chrome(path, options=chrome_options)
    driver.maximize_window()  # maximize the browser window
    for i in link4:
        url = i.rstrip()
        driver.get(url)
        tag = driver.find_element_by_tag_name("select")  # the page-number dropdown
        getlink(url)
        file = urllib.request.urlopen(url).read()
        file = file.decode('ANSI')
        # the chapter title sits in: viewname = "..."
        chapter_matches = re.compile(r'viewname = "((.*?))"', re.S).findall(file)
        for match in chapter_matches:
            chapter_name = str(match[0])
            chapter_name = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', chapter_name, re.S)
            chapter_name = "".join(chapter_name)  # keep only Chinese characters, letters and digits
            file_path_zhang = file_path + '\\' + chapter_name  # one folder per chapter
            if not os.path.exists(file_path_zhang):
                os.mkdir(file_path_zhang)
            else:
                print(file_path_zhang + ' directory already exists')
                continue
        jpg_path = chapter_matches[-1][0]
        jpg_path = re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', jpg_path, re.S)
        jpg_path = "".join(jpg_path)  # keep only Chinese characters, letters and digits
        a_list = Select(tag).options  # one <option> per page in the chapter
        gethtml1 = [url + '?page=' + str(n) for n in range(1, len(a_list) + 1)]
        for url1 in gethtml1:
            print(url1)
            b_list = gethtml1.index(url1) + 1  # 1-based page counter
            result1 = re.search('page=(.*)', url1).group(1)
            driver.get(url1)
            tag1 = driver.find_element_by_id("viewimg").get_attribute("src")  # the page image
            response = requests.get(tag1)
            file_path_img = '{}/{}/{}.{}'.format(path_file1, jpg_path, result1, 'jpg')
            if not os.path.exists(file_path_img):
                with open(file_path_img, 'wb') as f:
                    f.write(response.content)
            if b_list >= len(a_list):  # last page: record the chapter as finished
                with open('link1.txt', 'a') as file_object:
                    file_object.write(url + "\n")
def test(content):  # restrict the entry box to digits
    # "" must also be allowed, otherwise the last digit can never be deleted
    if content.isdigit() or content == "":
        return True
    else:
        return False
def main_win():
    root1 = tk.Tk()
    root1.resizable(0, 0)
    v1 = tk.StringVar()
    v2 = tk.StringVar()
    root1.title("漫画爬虫")  # "manga scraper"
    screenwidth = root1.winfo_screenwidth()
    screenheight = root1.winfo_screenheight()
    dialog_width = 360
    dialog_height = 180
    # center the window on screen
    root1.geometry("%dx%d+%d+%d" % (dialog_width, dialog_height, (screenwidth-dialog_width)/2, (screenheight-dialog_height)/2))
    def start():  # fresh start: wipe both progress files
        open("link.txt", 'w').close()
        open("link1.txt", 'w').close()
        s1.config(state=tk.DISABLED)
        thread_it(liulanqi)
    def stop():  # currently unused: re-enables the button and restarts the worker
        s1.config(state=tk.NORMAL)
        thread_it(liulanqi)
    def con_start():  # resume: drop already-finished chapters from the work list
        with open("link.txt") as file_object:
            lines = file_object.readlines()
        with open("link1.txt") as file_object:
            lines1 = file_object.readlines()
        lines2 = [i for i in lines if i not in lines1]
        with open("link.txt", 'w') as file_object:
            for line2 in lines2:
                file_object.write(line2)
        thread_it(liulanqi)
        s1.config(state=tk.DISABLED)
    test_cmd = root1.register(test)
    v = IntVar()
    global e
    e = Entry(root1,
              width=10,
              textvariable=v,
              validate='key',  # validatecommand fires on every keystroke
              validatecommand=(test_cmd, '%P')  # %P is the would-be content of the entry
              )
    e.place(x=150, y=20)
    s1 = tk.Button(root1, text='开始下载', font=('宋体', 12), width=8, height=1, command=lambda: thread_it(start))      # start download
    s1.place(x=100, y=60)
    s2 = tk.Button(root1, text='继续下载', font=('宋体', 12), width=8, height=1, command=lambda: thread_it(con_start))  # resume download
    s2.place(x=200, y=60)
    s3 = tk.Button(root1, text='打开网页', font=('宋体', 12), width=8, height=1, command=lambda: thread_it(open_page))  # open the site
    s3.place(x=150, y=120)
    root1.mainloop()

def get_page():  # build the comic index URL from the ID typed into the entry box
    url1 = 'http://www.pufei8.com/manhua/'
    global eput
    eput = int(e.get())
    page2 = []
    for i in range(eput, eput + 1):  # a single-element range, kept as a loop
        page2.append(url1 + str(i))
    return page2
def open_page():  # just open the site in the 360 browser
    __browser_url = key360()  # path of the 360 browser executable
    chrome_options = Options()
    chrome_options.binary_location = __browser_url
    path1 = os.getcwd()
    path = os.path.join(path1, r"78.0.39.4.108\chromedriver.exe")  # full path to chromedriver
    # set BEFORE creating the driver, same as in liulanqi()
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    global driver
    driver = webdriver.Chrome(path, options=chrome_options)
    driver.maximize_window()  # maximize the browser window
    driver.get("http://www.pufei8.com")
if __name__ == '__main__':
    main_win()
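If you run from source instead of the packaged build, the paths in the code imply a working directory laid out roughly like this (the script name is my placeholder; the chromedriver folder name comes straight from the code):

manga_spider.py                   # this script (hypothetical name)
78.0.39.4.108\chromedriver.exe    # driver version must match the 360 browser kernel
link.txt, link1.txt               # progress files, created on the first run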




The 360 browser must be installed; your own copy may get stuck on a data: page if its kernel version differs from what the bundled chromedriver expects.
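As a pre-flight check, a minimal sketch like the one below (assuming the same 360SeSES\shell\open\command key that key360() reads) can tell you whether the browser is registered before the scraper crashes on it:

import winreg

def has_360_browser():
    # Probe the registry key key360() relies on; a missing key means no 360 browser
    try:
        key = winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, r'360SeSES\shell\open\command')
        winreg.CloseKey(key)
        return True
    except FileNotFoundError:
        return False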
The packaged build has no antivirus evasion applied, so before installing, turn off 360 Guard or your antivirus, or add the program to its whitelist.
Pick any comic, for example http://www.pufei8.com/manhua/1/209670.html?page=14
The 1 after manhua/ identifies the whole comic to scrape; type that number into the input box. If the ID has no matching comic, the program errors out, and since exceptions are not handled, just close it and start again.
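To see which number that is for an arbitrary chapter URL, a small hypothetical helper (not part of the tool) can extract it:

import re

def comic_id(url):
    # Grab the number right after /manhua/ in a pufei8 URL
    m = re.search(r'/manhua/(\d+)', url)
    return m.group(1) if m else None

print(comic_id('http://www.pufei8.com/manhua/1/209670.html?page=14'))  # prints 1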
If you closed the program before a comic finished, reopen it and click 继续下载 (resume download); do not click 开始下载 (start download) at that point, because it wipes the recorded progress.
The site sometimes fails to serve an image and the download is skipped; the only remedy is to delete the failed image files and retry.
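A retry wrapper along these lines (my own sketch, not something the tool currently does) would re-request a flaky image a few times before giving up:

import time
import requests

def fetch_with_retry(img_url, attempts=3, delay=2):
    # Try the image URL a few times; return the bytes on success, None on failure
    for _ in range(attempts):
        r = requests.get(img_url)
        if r.status_code == 200 and r.content:
            return r.content
        time.sleep(delay)
    return None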
Downloaded comics are written to folders under the directory the program runs from.
A viewer such as 看图王 can auto-sort images and browse across folders, which makes it a good fit for this directory layout.
For other questions, send me a private message on CSDN.
Packaged build: https://pan.baidu.com/s/1hroQfMupsoEGGimSUQ9g_Q
Extraction code: cgqw
