#-*-coding:GBK -*-
import urllib.request
import lxml
import pyquery
import zlib
import winreg #操作注册表
from bs4 import BeautifulSoup
import requests
import re
import time
from selenium.webdriver.chrome.options import Options
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
import threading #多线程
import os
from selenium.webdriver.support.select import Select
import tkinter as tk
import tkinter.messagebox as msg
from tkinter import *
import win32gui,win32api,win32con
from win32gui import *
def key360():
    """Return the path of the 360 browser executable (``360se.exe``).

    The path is taken from the first value stored under
    ``HKEY_CLASSES_ROOT\\360SeSES\\shell\\open\\command``: everything up to
    and including the first ``360se.exe`` in that command line.

    Raises:
        OSError: the registry key cannot be opened or has no values.
        FileNotFoundError: the command line does not mention ``360se.exe``.
    """
    command_key = r'360SeSES\shell\open\command'
    # The context manager closes the registry handle even if reading fails.
    with winreg.OpenKey(winreg.HKEY_CLASSES_ROOT, command_key) as handle:
        # EnumValue yields (value name, value data, data type); only the data is needed.
        _value_name, command, _value_type = winreg.EnumValue(handle, 0)

    found = re.search(r"(.+?)360se\.exe", command)
    if found is None:
        raise FileNotFoundError(
            '360se.exe not found in registry command: {!r}'.format(command)
        )
    # A quoted command line ("C:\...\360se.exe" "%1") would otherwise leave a
    # stray opening quote at the front of the returned path.
    return found.group(1).lstrip('"') + '360se.exe'
def thread_it(func, *args):
    """Run ``func(*args)`` on a background daemon thread and return at once.

    Args:
        func: callable to execute in the new thread.
        *args: positional arguments forwarded to ``func``.

    Returns:
        None. The thread is intentionally not joined: joining would block the
        caller, which freezes the user interface when called from it.
    """
    # daemon=True replaces the deprecated Thread.setDaemon(True): the worker
    # does not keep the process alive once the main thread exits.
    worker = threading.Thread(target=func, args=args, daemon=True)
    worker.start()
def getlink(url):
    """Install a process-wide urllib opener that sends a desktop Chrome User-Agent.

    After this call every ``urllib.request.urlopen`` in the process uses the
    installed opener.

    Args:
        url: accepted for call-site compatibility; it is not used here.
    """
    user_agent = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/62.0.3202.94 Safari/537.36"
    )
    opener = urllib.request.build_opener()
    # Replace the default header list so only this User-Agent is sent.
    opener.addheaders = [("User-Agent", user_agent)]
    urllib.request.install_opener(opener)
def get_link(url):
    """Scrape one comic index page and record its chapter URLs.

    Steps:
      1. Download ``url`` and read the comic title(s) from the
         ``class="pic" title="..."`` attribute.
      2. Make sure a folder named after each title exists in the current
         working directory.
      3. Collect every ``href`` found inside the
         ``class="plist pmedium max-h200"`` block(s) and append the absolute
         chapter URLs to ``link.txt``, one per line.

    Module globals written (other functions in this file read them):
        file_path:  absolute path of the comic folder (last title found).
        path_file1: the comic title itself (last title found).
        filename:   ``'link.txt'``  -- list of chapter URLs.
        filename1:  ``'link1.txt'`` -- list of finished chapter URLs.

    Raises:
        ValueError: no comic title was found on the page.
    """
    global file_path, path_file1, filename, filename1

    # Install the User-Agent opener *before* the request so this fetch uses it too.
    getlink(url)
    raw_page = urllib.request.urlopen(url).read()
    # NOTE(review): 'ANSI' resolves to the local Windows code page; presumably
    # the site is GBK-encoded -- confirm before changing the codec.
    html = raw_page.decode('ANSI')

    titles = re.findall(r'class="pic" title="(.*?)"', html, re.S)
    if not titles:
        raise ValueError('no comic title found on ' + url)

    for title in titles:
        file_path = os.path.join(os.getcwd(), str(title))
        if not os.path.exists(file_path):
            os.mkdir(file_path)
        else:
            # NOTE(review): an existing folder is only reported; processing
            # carries on so that a resumed download still gets its globals
            # and chapter list -- confirm this matches the intended flow.
            print(file_path + ' 目录已存在')
    path_file1 = titles[-1]

    filename = 'link.txt'
    filename1 = 'link1.txt'

    # Search each chapter-list fragment directly instead of the repr() of the
    # fragment list, which would escape quotes and backslashes inside the HTML.
    fragments = re.findall(
        r'class="plist pmedium max-h200".*?>(.*?)</div>', html, re.S
    )
    hrefs = []
    for fragment in fragments:
        hrefs.extend(re.findall(r'href="(.*?)"', fragment, re.S))

    # Open the list once in append mode rather than once per link.
    with open(filename, 'a') as file_object:
        for href in hrefs:
            gethtml = 'http://www.pufei8.com' + href
            print(gethtml)
            file_object.write(gethtml + "\n")
def get_link2():
    """Collect the chapter links of every comic page returned by ``get_page``.

    Each page URL is passed to ``get_link``, which appends the chapter URLs to
    the file named by the module global ``filename``.

    Returns:
        list[str]: every line of that file, trailing newlines included.
    """
    for page_url in get_page():
        get_link(page_url)
    # ``filename`` is a module global assigned by get_link().
    with open(filename) as file_object:
        return file_object.readlines()
def _clean_name(text):
    """Return *text* reduced to CJK characters, ASCII letters and digits."""
    return "".join(re.findall('[\u4e00-\u9fa5a-zA-Z0-9]+', text))


def _read_finished(list_path):
    """Return the set of URLs listed in *list_path* (empty if the file is missing)."""
    try:
        with open(list_path) as done_file:
            return {line.strip() for line in done_file if line.strip()}
    except FileNotFoundError:
        return set()


def _save_image(src, target):
    """Download *src* into *target*; return True only when image bytes were written."""
    if not src:
        return False
    try:
        response = requests.get(src, timeout=30)
        # Do not store an HTTP error page under a .jpg name.
        response.raise_for_status()
    except requests.RequestException as err:
        print('{} 下载失败: {}'.format(src, err))
        return False
    if not response.content:
        return False
    with open(target, 'wb') as image_file:
        image_file.write(response.content)
    return True


def _download_chapter(url):
    """Fetch the missing page images of the chapter at *url*.

    Uses the module globals ``driver`` (set by liulanqi) and ``file_path``
    (the comic folder, set by get_link).

    Returns:
        bool: True when the chapter has at least one page and every page
        image is on disk afterwards.
    """
    driver.get(url)
    # The number of <option>s in the page's <select> is used as the page count.
    page_select = driver.find_element_by_tag_name("select")
    page_count = len(Select(page_select).options)

    getlink(url)
    html = urllib.request.urlopen(url).read().decode('ANSI')
    names = re.findall(r'viewname = "(.*?)"', html, re.S)
    if not names:
        print(url + ' 未找到章节名,已跳过')
        return False

    chapter_dir = os.path.join(file_path, _clean_name(str(names[-1])))
    if os.path.exists(chapter_dir):
        # NOTE(review): an existing chapter folder is only reported; its pages
        # are still checked so that images deleted after a failed download
        # are fetched again -- confirm this matches the intended flow.
        print(chapter_dir + ' 目录已存在')
    else:
        os.makedirs(chapter_dir)

    complete = page_count > 0
    for page in range(1, page_count + 1):
        page_url = url + r'?page=' + str(page)
        print(page_url)
        image_path = os.path.join(chapter_dir, '{}.jpg'.format(page))
        if os.path.exists(image_path):
            # Already on disk: skip both the page load and the download.
            continue
        driver.get(page_url)
        src = driver.find_element_by_id("viewimg").get_attribute("src")
        if not _save_image(src, image_path):
            complete = False
    return complete


def liulanqi():
    """Download every page image of every chapter returned by ``get_link2``.

    A Chrome-driver session is started on the 360 browser binary (see
    ``key360``), using ``78.0.39.4.108/chromedriver.exe`` under the current
    working directory. Each chapter URL is opened, its pages are saved as
    ``<comic folder>/<chapter name>/<page>.jpg`` and, once all pages of a
    chapter are on disk, the chapter URL is appended to ``link1.txt``.

    Chapters already listed in ``link1.txt`` and duplicate URLs are skipped.
    The browser is closed when the run ends, whether it succeeded or not.
    """
    global driver

    chapter_links = get_link2()

    chrome_options = Options()
    chrome_options.binary_location = key360()
    # Must be set before the driver is created -- options changed afterwards
    # are not sent to the browser. Dropping 'enable-automation' makes it less
    # likely that sites detect Selenium.
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver_path = os.path.join(os.getcwd(), "78.0.39.4.108", "chromedriver.exe")

    driver = webdriver.Chrome(driver_path, options=chrome_options)
    try:
        driver.maximize_window()
        handled = _read_finished('link1.txt')
        for line in chapter_links:
            url = line.rstrip()
            if not url or url in handled:
                continue
            handled.add(url)
            if _download_chapter(url):
                # Record the chapter in the finished list.
                with open('link1.txt', 'a') as file_object:
                    file_object.write(url + "\n")
    finally:
        # Release the browser and chromedriver processes.
        driver.quit()
def test(content):
    """Tk ``validatecommand`` callback: allow only whole-number input.

    Args:
        content: the prospective text of the entry box.

    Returns:
        bool: True for the empty string (otherwise the last character could
        never be deleted) or for text made only of decimal digits.
    """
    # isdecimal() rather than isdigit(): isdigit() also accepts characters
    # such as superscript digits, which int() cannot convert.
    return content == "" or content.isdecimal()


# The name starts with "test": mark the validator so pytest-style collectors
# never mistake it for a test case when this module is star-imported.
test.__test__ = False
def main_win():
    """Build the main window and run the Tk event loop until it is closed.

    The window holds a digits-only entry box (published as the module global
    ``e``, which ``get_page`` reads) and three buttons:

    * "开始下载" -- empty ``link.txt`` and ``link1.txt``, then start a crawl.
    * "继续下载" -- drop the chapters listed in ``link1.txt`` from
      ``link.txt``, then start a crawl.
    * "打开网页" -- open the site through ``open_page``.

    Widget state is only changed on the Tk thread; the long-running work is
    handed to ``thread_it``.
    """
    global e

    root1 = tk.Tk()
    root1.resizable(0, 0)
    root1.title("漫画爬虫")

    dialog_width = 360
    dialog_height = 180
    # Centre the fixed-size window on the screen.
    offset_x = (root1.winfo_screenwidth() - dialog_width) // 2
    offset_y = (root1.winfo_screenheight() - dialog_height) // 2
    root1.geometry("%dx%d+%d+%d" % (dialog_width, dialog_height, offset_x, offset_y))

    def read_lines(list_name):
        """Return the lines of *list_name*, or an empty list if it is missing."""
        try:
            with open(list_name) as file_object:
                return file_object.readlines()
        except FileNotFoundError:
            return []

    def start():
        """Forget all recorded progress and start a fresh download."""
        for list_name in ("link.txt", "link1.txt"):
            # Opening in 'w' mode truncates (or creates) the file.
            with open(list_name, 'w'):
                pass
        s1.config(state=tk.DISABLED)
        thread_it(liulanqi)

    def con_start():
        """Resume: keep only the unfinished chapters in link.txt, then crawl."""
        pending = read_lines("link.txt")
        finished = set(read_lines("link1.txt"))
        with open("link.txt", 'w') as file_object:
            file_object.writelines(
                line for line in pending if line not in finished
            )
        s1.config(state=tk.DISABLED)
        thread_it(liulanqi)

    test_cmd = root1.register(test)
    v = tk.IntVar()
    e = tk.Entry(
        root1,
        width=10,
        textvariable=v,
        validate='key',  # run validatecommand on every edit
        validatecommand=(test_cmd, '%P'),  # %P is the prospective entry text
    )
    e.place(x=150, y=20)

    # start/con_start run on the Tk thread because they touch widgets; they
    # push the actual crawl onto a worker thread themselves.
    s1 = tk.Button(root1, text='开始下载', font=('宋体', 12), width=8, height=1,
                   command=start)
    s1.place(x=100, y=60)
    resume_button = tk.Button(root1, text='继续下载', font=('宋体', 12), width=8,
                              height=1, command=con_start)
    resume_button.place(x=200, y=60)
    open_button = tk.Button(root1, text='打开网页', font=('宋体', 12), width=8,
                            height=1, command=lambda: thread_it(open_page))
    open_button.place(x=150, y=120)

    root1.mainloop()
def get_page(comic_id=None):
    """Return the list of comic index URLs to crawl (always a single URL).

    Args:
        comic_id: number of the comic, as an int or a digit string. When
            omitted, the text of the entry box (module global ``e``) is used.

    Returns:
        list[str]: ``['http://www.pufei8.com/manhua/<number>']``.

    Raises:
        ValueError: the value is empty or not a whole number.

    Side effect: stores the parsed number in the module global ``eput``.
    """
    global eput

    base_url = 'http://www.pufei8.com/manhua/'
    raw_id = e.get() if comic_id is None else comic_id
    try:
        eput = int(raw_id)
    except (TypeError, ValueError):
        raise ValueError(
            'comic number must be a whole number, got {!r}'.format(raw_id)
        ) from None
    return [base_url + str(eput)]
def open_page():
    """Open http://www.pufei8.com in the 360 browser through chromedriver.

    The browser binary is located with ``key360``; the driver executable is
    ``78.0.39.4.108/chromedriver.exe`` under the current working directory.
    The session is stored in the module global ``driver``.
    """
    global driver

    chrome_options = Options()
    chrome_options.binary_location = key360()
    # Must be set before the driver is created -- options changed afterwards
    # are not sent to the browser. Dropping 'enable-automation' makes it less
    # likely that sites detect Selenium.
    chrome_options.add_experimental_option("excludeSwitches", ['enable-automation'])
    driver_path = os.path.join(os.getcwd(), "78.0.39.4.108", "chromedriver.exe")

    driver = webdriver.Chrome(driver_path, options=chrome_options)
    driver.maximize_window()
    driver.get("http://www.pufei8.com")
# Launch the GUI only when the file is run as a script, not when it is imported.
if __name__ == "__main__":
    main_win()
# 必须安装 360 浏览器;自己的 360 浏览器可能因为内核版本不同而停留在 data 页面。
# 没有做病毒免杀处理,安装时请关闭卫士/杀毒软件,或添加到白名单后再使用。
# 任选一部漫画,例如:http://www.pufei8.com/manhua/1/209670.html?page=14
# “manhua/”后面的 1 就是要爬取的整部漫画的编号,在输入框中输入即可。如果没有对应的链接,程序会报错;异常没有处理,退出后重新运行即可。
# 漫画没有下载完整就关掉了软件时,可以重新打开并点击“继续下载”;这时不要点击“开始下载”,否则会清空下载进度。
# 这个网站有时会刷不出图片而跳过下载,只能把下载失败的图片删除后重试。
# 下载的漫画会生成在软件所在目录下的文件夹中。
# “看图王”可以对图片自动排序,并且可以跳过文件夹浏览图片,非常方便,也很适合这种目录结构的漫画。
# 有其他问题可以在 CSDN 上私信我。
# 成品链接:https://pan.baidu.com/s/1hroQfMupsoEGGimSUQ9g_Q
# 提取码:cgqw