Manga Crawler
Introduction
Crawls publicly available manga, packs each finished download into a Zip, and sends it to a designated folder on the phone.
2020-10-20: Added a manganelo crawler (another English-language manga site). It parses pages with BeautifulSoup and uses multithreading plus automatic requests retries; the code is at the end of this post. The gm referenced in that code is a small utility package of my own, used here only to pick a random header.
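gm itself is not included in this post; as a minimal stand-in sketch, assuming all the manganelo script needs from it is gm.get_headers() returning a list of User-Agent strings (the same contract as the main class's get_headers below):

# gm.py -- hypothetical stand-in for the author's own utility package;
# the manganelo crawler only calls gm.get_headers().
def get_headers():
    # headers.csv holds one User-Agent string per line
    with open("headers.csv") as ua:
        return ua.readlines()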
Since this is a crawler, there is always some chance of copyright infringement; content will be taken down on request.
Components
Third-party dependencies
- selenium (the matching webdriver must be installed separately)
- tqdm
Phone side
- termux (talks to the computer over ssh)
- comicscreen (local manga reader app)
Script limitations
- No search feature, and the different sources are not integrated
- Not an async crawler, so throughput is modest; it was written for fun and meets my own needs
- The crawlers use plain regular expressions rather than XPath or CSS selectors, simply for convenience (a quick comparison follows this list)
- 2020-10-20: The latest crawler adds multithreading, BeautifulSoup, and other refinements, so it is much faster. My own setup now serves the downloaded manga over SMB from a Tencent Cloud host, so I can read them online.
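For what it's worth, the two extraction styles compare roughly like this (a toy sketch on a made-up HTML fragment, not real site output):

import re
from bs4 import BeautifulSoup

html = '<h1>SomeManga</h1>'  # made-up fragment for illustration
# Regex extraction, as in the main class below
print(re.findall(r'<h1>(.*?)</h1>', html)[0])       # SomeManga
# Parser-based extraction, as in the manganelo crawler at the end
print(BeautifulSoup(html, 'lxml').find('h1').text)  # SomeManga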
Code
The main class, already adapted for taduo and ipufei:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020-02-15
import time
import requests, random, re, os
from selenium import webdriver
from tqdm import tqdm
class CartoonDownloader:
def __init__(self, title_re='', url_list_re='', page_list_re='',
viewname_re='', img_url_re='', coding='', url_page_str=''):
self.title = ''
self.title_re = title_re
self.coding = coding
self.url_list_re = url_list_re
self.page_list_re = page_list_re
self.viewname_re = viewname_re
self.img_url_re = img_url_re
self.url_page_str = url_page_str
@staticmethod
def get_headers():
with open("headers.csv") as ua:
user_agent_list = ua.readlines()
return user_agent_list
    @staticmethod
    def remove_illegal_character(s):
        # Drop characters that are awkward in file names and shell commands
        invalid_symbol = '() '
        return ''.join(ch for ch in s if ch not in invalid_symbol)
def get_all_pages(self, url):
final_list = []
r = requests.get(url=url, headers={"User-Agent": random.choice(self.get_headers()).strip()}, timeout=3)
if r.status_code == 200:
c = r.content.decode(encoding=self.coding)
self.title = self.remove_illegal_character(re.findall(self.title_re, c)[0].strip())
if not os.path.exists(self.title):
os.mkdir(self.title)
url_list = re.findall(self.url_list_re, c)
url_list.reverse()
for i in url_list:
final_list.append(os.path.join(url, i))
return final_list
def get_all_images(self, url):
        # Build a headless Chrome driver
        option = webdriver.ChromeOptions()
        option.add_argument("--headless")
        driver = webdriver.Chrome(options=option)
        # First get the total page count
driver.get(url)
c = driver.page_source
page_list = re.findall(self.page_list_re, c)
try:
page_num = int(page_list[0])
except IndexError:
return
        # Get the chapter title
try:
viewname = re.findall(self.viewname_re, c)[0].split(',')[1]
except IndexError:
viewname = re.findall(self.viewname_re, c)[0]
child_folder = os.path.join(self.title, viewname)
print(child_folder)
time.sleep(0.01)
if not os.path.exists(child_folder):
os.mkdir(child_folder)
        bar = tqdm(range(1, page_num + 1))  # tqdm progress bar
for i in bar:
current_url = url + self.url_page_str + str(i)
driver.get(current_url)
            # Extract the image URL on this page
img_url = re.findall(self.img_url_re, driver.page_source)
# print(page_num, viewname, img_url)
try:
self.downloader(img_url[0], os.path.join(child_folder, str(i) + '.jpg'))
except IndexError:
pass
# time.sleep(random.random() * 3)
driver.quit()
    def downloader(self, url, path, retries=3):
        headers = {'User-Agent': random.choice(self.get_headers()).strip()}
        try:
            r = requests.get(url=url, headers=headers, timeout=10)
        except requests.RequestException:
            # Retry a bounded number of times instead of recursing forever
            if retries > 0:
                self.downloader(url, path, retries - 1)
            return
        if r.status_code == 200:
            with open(path, 'wb') as f:
                f.write(r.content)
        else:
            print(r.status_code, url)
def trigger(self, url, index):
        # Decide between downloading the whole series or a single chapter:
        # a single chapter goes straight to get_all_images; otherwise
        # get_all_pages collects every chapter URL first
count_num = index
if url[-5:] != '.html':
all_pages = self.get_all_pages(url)
for i in all_pages[index - 1:]:
                print('~~~~~~~~~~ %s chapters total, current chapter %s, %s done ~~~~~~~~~~' % (
                    len(all_pages), count_num, str(round(count_num / len(all_pages) * 100, 2)) + '%'))
self.get_all_images(i)
count_num += 1
            # Invoke the bash sender script directly
os.system('sh sender.sh %s' % self.title)
else:
self.get_all_images(url)
if __name__ == '__main__':
    # taduo
    # Match the manga title
    title_re = r'<h1>(.*?)</h1>'
    # Match every chapter link
    url_list_re = r'href=\"\/.*?\/.*?\/(.+?l)\"'
    # Match the total page count
    page_list_re = r'>\/(\d+?)P<\/span>'
    # Match the chapter name
    viewname_re = r'words\" content=\"(.*?)\"'
    # Match the image URL
    img_url_re = r'src=\"(http.*?jpg)\"'
    # Page encoding
    coding = 'GBK'
    # Pagination query string, differs from site to site
    url_page_str = '?page='
    # Series URL
url = 'http://m.taduo.net/manhua/1/'
    # ipufei; it seems this one has blocked me
# title_re = r'class=\"titleInfo\"><h1>(.+?)<\/h1>'
# url_list_re = r'.*?manhua\/\d+?\/(\d+?\.html)\" title'
# page_list_re = r'option value=\"(\d+)\"'
# viewname_re = r'viewname\ =\ \"(.+?)\"\;'
# img_url_re = r'img src=\"(http:.+?)\" onerror'
# coding = 'GBK'
    # url_page_str = '?page='
# url = 'http://www.ipufei.com/manhua/149/'
cd = CartoonDownloader(title_re=title_re, url_list_re=url_list_re, page_list_re=page_list_re,
viewname_re=viewname_re, img_url_re=img_url_re, coding=coding,
url_page_str=url_page_str)
cd.trigger(url, 1)
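Note that get_headers expects a headers.csv file in the working directory, one User-Agent string per line; the scripts pick one at random for each request.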
There is also a foreign manga site, mangaowl, in English. Manga dialogue is colloquial by nature, so reading it is a nice way to build a feel for the language:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020-02-17
import os
import random
import re
import time
from tqdm import tqdm
from selenium import webdriver
from dl_class import CartoonDownloader
class DM5(CartoonDownloader):
viewname = 0
def get_all_pages(self, url):
        # Build a headless Chrome driver
        option = webdriver.ChromeOptions()
        option.add_argument("--headless")
        driver = webdriver.Chrome(options=option)
        # First get the total page count
driver.get(url)
c = driver.page_source
# open('ha.html', 'w').write(c)
self.title = self.remove_illegal_character(re.findall(self.title_re, c)[0].strip())
if not os.path.exists(self.title):
os.mkdir(self.title)
url_list = re.findall(self.url_list_re, c)
url_list.reverse()
        # Map each chapter URL to its index (only used for debugging)
        DM5.dic = dict(zip(url_list, range(len(url_list))))
driver.quit()
return url_list
def get_all_images(self, url):
        option = webdriver.ChromeOptions()
        option.add_argument("--headless")
        driver = webdriver.Chrome(options=option)
driver.get(url)
c = driver.page_source
# open('ha.html', 'w').write(c)
img_list = re.findall(self.img_url_re, c)
        # Name chapter folders with plain numbers
child_folder = os.path.join(self.title, str(DM5.viewname))
print(child_folder)
time.sleep(0.01)
if not os.path.exists(child_folder):
os.mkdir(child_folder)
        bar = tqdm(range(len(img_list)))  # tqdm progress bar
for i in bar:
            # Fetch the image content
try:
self.downloader(img_list[i], os.path.join(child_folder, str(i) + '.jpg'))
except IndexError:
print(img_list[i])
time.sleep(random.random())
driver.quit()
return img_list
def trigger(self, url, index):
        # Decide between downloading the whole series or a single chapter:
        # a single chapter (URL ending in digits) goes straight to
        # get_all_images; otherwise get_all_pages collects every chapter URL
DM5.viewname = index
try:
int(url[-4:])
self.get_all_images(url)
except ValueError:
all_pages = self.get_all_pages(url)
# print(all_pages, DM5.dic)
for i in all_pages[index - 1:]:
                # Retry budget for a single chapter
flag = 10
                print('~~~~~~~~~~ %s chapters total, current chapter %s, %s done ~~~~~~~~~~' % (
                    len(all_pages), DM5.viewname, str(round(DM5.viewname / len(all_pages) * 100, 2)) + '%'))
result = self.get_all_images(i)
while not result and flag > 0:
result = self.get_all_images(i)
flag -= 1
                if flag == 0:
                    print('~~~~~~~~~~ Chapter %s download failed ~~~~~~~~~~' % DM5.viewname)
DM5.viewname += 1
os.system('sh sender.sh %s' % self.title)
if __name__ == '__main__':
title_re = r'title=\"(.*?)\"'
url_list_re = r'href=\"(http.*?\d{4})\"'
img_url_re = r'data-src=\"(h.*?jpg)\"'
# url = 'https://mangaowl.com/single/92/real' # Real
url = 'https://mangaowl.com/single/47172/onepunch-man-one-' # One Punch Man
cd = DM5(title_re=title_re, url_list_re=url_list_re, img_url_re=img_url_re)
    # The overseas connection is flaky; retry up to three times on failure
times = 3
while times:
try:
cd.trigger(url, 123)
break
        # If the page did not load correctly, the regex matches nothing and
        # indexing the empty list raises IndexError
except IndexError:
times -= 1
Next is the shell script; rough usage notes are in its comments:
#!/usr/bin/env bash
# path is the parent directory of the manga downloads
path=/Users/kelvinchi/Documents/myProject/myItchat/manga_downloader/
if [ -n "$1" ]; then
    # Build the absolute path of the manga to pack from the parent dir and the argument
curpath=$path$1
    # Zip the folder, delete the originals, then send the archive
    # Sending goes over ssh; on the phone I use termux (no root required).
    # After installing it, grant local-storage access and run sshd to start ssh.
    # Note the port is 8022 rather than 22, and the final storage path on the
    # phone needs a symlink created with ln -s
zip $curpath.zip -r $curpath &&
rm -r $curpath &&
scp -P 8022 $curpath.zip root@mix:~/storage/1
    echo "Transfer complete"
else
    echo "A folder name is required"
fi
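Finally, here is the manganelo crawler added in the 2020-10-20 update: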
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: CK
# Date: 2020/10/18
import os, sys
import random
import re
import time
from selenium import webdriver
from tqdm import tqdm
from bs4 import BeautifulSoup
import requests
from requests.adapters import HTTPAdapter
import threading
# Directory containing this script
abs_path = os.path.dirname(os.path.abspath(sys.argv[0]))
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from general_methods import gm
class Manganelo:
def __init__(self):
self.view_name = 0
self.dic = {}
        # Cap the number of concurrent download threads
        self.thread_num = threading.BoundedSemaphore(8)
        # Log file path
        self.log = os.path.join(abs_path, 'manganelo.log')
        # Mount adapters so failed connections are retried automatically
        self.s = requests.Session()
        self.s.mount('http://', HTTPAdapter(max_retries=5))
        self.s.mount('https://', HTTPAdapter(max_retries=5))
def get_all_pages(self, url):
"""
用心获取主页面内所有漫画链接
:param url:
:return:
"""
header = {
'referer': 'https://manganelo.com/',
'user-agent': random.choice(gm.get_headers()).strip()
}
try:
req = self.s.get(url, headers=header, timeout=5)
c = req.text
# open('ha.html', 'w').write(c)
soup = BeautifulSoup(c, 'lxml')
info = str(soup.findAll('a', {'class': 'chapter-name text-nowrap'}))
page_list = re.findall(r'(https:.*?)\"', info)
title_info = str(soup.findAll('div', {'class': 'story-info-right'}))
title = re.findall(r'<h1>(.+?)</h1>', title_info)[0].replace(' ', '_')
return page_list, title
except requests.exceptions.ConnectTimeout:
string = '\033[01;31mTimeout\033[0m'
print(string)
os.system('echo %s >> %s' % (string, self.log))
def get_all_images(self, url, title):
"""
获取url内指定范围内所有图片链接
:param url:
:param title:
:return:
"""
header = {
'referer': 'https://manganelo.com/',
'user-agent': random.choice(gm.get_headers()).strip()
}
threads = []
try:
req = self.s.get(url, headers=header, timeout=10)
except requests.exceptions.ConnectTimeout:
string = '\033[01;31mTimeout\033[0m'
print(string)
os.system('echo %s >> %s' % (string, self.log))
return None
c = req.text
# open('ha.html', 'w').write(c)
soup = BeautifulSoup(c, 'lxml')
info = str(soup.findAll('div', {'class': 'container-chapter-reader'}))
img_list = re.findall(r'https:.*?\.jpg', info)
folder_name = os.path.join(abs_path, '%s/%s' % (title, url.split('/')[-1]))
os.makedirs(folder_name, exist_ok=True)
        print('\033[01;32m%s download started...\033[0m' % url.split('/')[-1])
for pic in img_list:
t = threading.Thread(target=self.downloader, args=(pic, folder_name,))
threads.append(t)
t.start()
for thread in tqdm(threads):
thread.join()
# print('%s Done' % url)
return img_list
def downloader(self, url, path):
"""
线程锁用以控制并发数量
:param url:
:param path:
:return:
"""
header = {
'referer': 'https://manganelo.com/',
'user-agent': random.choice(gm.get_headers()).strip()
}
self.thread_num.acquire()
# print(url)
try:
req = self.s.get(url, headers=header)
with open(os.path.join(path, url.split('/')[-1].zfill(7)), 'wb') as t:
t.write(req.content)
except Exception:
            string = '%s download failed' % url
print(string)
os.system('echo %s >> %s' % (string, self.log))
self.thread_num.release()
def trigger(self, url, index):
        # Decide between downloading the whole series or a single chapter:
        # a chapter URL ends in a chapter number, a series URL does not
self.view_name = index
try:
float(url.split('_')[-1])
self.get_all_images(url, 'Temp')
except ValueError:
all_pages_info = self.get_all_pages(url)
pages_list = all_pages_info[0]
pages_list.reverse()
title = all_pages_info[1]
for i in pages_list[index - 1:]:
            print('\033[07;36m %s: %s chapters total, current chapter %s, %s done\033[0m' % (
                title, len(pages_list), self.view_name, str(round(self.view_name / len(pages_list) * 100, 2)) + '%'))
result = self.get_all_images(i, title)
self.view_name += 1
time.sleep(random.randint(0, 3))
pics_path = os.path.join(abs_path, title)
target_path = '/home/ck/manga'
            print('\033[01;32mCompressing...\033[0m')
            os.system('cd %s; rm -rf %s.zip; zip -qr %s.zip %s; rm -rf %s' %
                      (abs_path, title, title, title, pics_path))
            print('\033[01;32mMoving...\033[0m')
            os.system('mkdir -p %s; cd %s; mv %s.zip %s' % (target_path, abs_path, title, target_path))
            os.system('chmod 755 %s.zip' % os.path.join(target_path, title))
            print('\033[01;32mDownload complete\033[0m')
def main():
    # The only argument is the starting chapter number
    if len(sys.argv) == 2:
        page = int(sys.argv[1])
    else:
        print('\033[01;31mPlease pass a starting chapter number\033[0m')
        sys.exit(1)
url = 'https://manganelo.com/manga/read_one_piece_manga_online_free4'
m = Manganelo()
m.trigger(url, page)
if __name__ == '__main__':
main()
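Assuming the file is saved as manganelo.py, run it with the starting chapter number as its only argument, e.g. python3 manganelo.py 1 to start from the first chapter; when the series finishes, it is zipped and moved to /home/ck/manga.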