# Scrape manga images from omanhua.com (哦漫画) and download them into per-chapter folders
from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import re
from urllib import request,parse
import os
# Shared headless PhantomJS browser, created once when the module is loaded.
# getMangaPage() uses it to load each reader page and read the rendered HTML.
# The executable path is hard-coded to a local Windows install.
# NOTE(review): webdriver.PhantomJS was deprecated and later dropped by
# Selenium -- confirm the installed Selenium version still provides it.
phantom = webdriver.PhantomJS(executable_path=r'E:\Python\phantomjs-2.1.1-windows\bin\phantomjs.exe')
def _clean_manga_name(heading):
    """Return the manga title from the page heading without the intro label.

    The label is removed only as a whole word at the start or end of the
    heading.  ``str.strip('漫画简介:')`` must not be used here: it treats its
    argument as a *set of characters* and would also eat title characters
    such as a leading ``漫`` or a trailing ``画``.
    """
    name = heading.strip()
    # Try the label with and without its trailing colon.
    for label in ('漫画简介:', '漫画简介'):
        if name.startswith(label):
            name = name[len(label):]
        if name.endswith(label):
            name = name[:-len(label)]
    return name.strip()


def getSectionLink():
    """Create the folder tree for one manga and download all its chapters.

    Fetches the manga index page, derives the manga title from the
    ``div.main01_content h2`` heading, creates ``img/<title>/`` plus one
    ``<index>-<chapter name>`` sub-folder per chapter link, and then calls
    ``getManga`` for every chapter.

    Raises:
        IndexError: if the index page has no ``div.main01_content h2`` element.
    """
    base_url = 'http://www.omanhua.com/comic/4014/'
    response = requests.get(base_url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'lxml')

    # Manga folder, named after the cleaned page heading.
    manga_name = _clean_manga_name(soup.select('div.main01_content h2')[0].text)
    manga_path = 'img/' + manga_name
    os.makedirs(manga_path, exist_ok=True)

    # Chapter anchors; the list is reversed before numbering
    # (presumably the site lists the newest chapter first -- TODO confirm).
    chapters = soup.select('div.subBookList ul li a')
    chapters.reverse()

    # Create every chapter folder up front, before any download starts.
    sections = []
    for index, anchor in enumerate(chapters):
        section_path = manga_path + '/' + str(index) + '-' + anchor.text
        os.makedirs(section_path, exist_ok=True)
        sections.append(('http://www.omanhua.com' + anchor['href'], section_path))

    for fullurl, section_path in sections:
        print(section_path)
        getManga(fullurl, section_path)
def getManga(fullurl, section_path):
    """Download every page of a single chapter.

    Fetches the chapter page, reads the total page count from the markup
    following ``id="page"``, and calls ``getMangaPage`` once per page.

    Args:
        fullurl: Absolute URL of the chapter; ``index.html?p=<n>`` is appended
            directly, so it is expected to end with ``/``.
        section_path: Folder the page images are saved into.
    """
    print(fullurl)
    response = requests.get(fullurl)
    response.encoding = 'utf-8'
    html = response.text

    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence that newer Python versions warn about.
    max_pat = re.compile(r'id="page".*?span>/(\d+)', re.S)
    res = max_pat.search(html)
    if res is None:
        # Page count not found: report it and skip this chapter.
        print('最大页数获取失败')
        return

    max_page = int(res.group(1))
    # Pages are numbered from 1 up to and including max_page.
    for page in range(1, max_page + 1):
        page_fullurl = fullurl + 'index.html?p=' + str(page)
        getMangaPage(page_fullurl, section_path)
def getMangaPage(fullurl, section_path):
    """Download the image shown on one reader page.

    Loads the page in the shared PhantomJS browser, reads the ``src`` of the
    ``img#mangaFile`` element, and writes the image bytes to
    ``<section_path>/<file name from the URL>``.

    Args:
        fullurl: URL of the reader page (chapter URL plus ``index.html?p=<n>``).
        section_path: Existing folder to save the image into.

    Raises:
        IndexError: if the rendered page has no ``img#mangaFile`` element.
    """
    phantom.get(fullurl)
    # Short pause before reading the page source
    # (presumably to let the page script set the image src -- TODO confirm).
    time.sleep(0.1)
    soup = BeautifulSoup(phantom.page_source, 'lxml')
    img_url = soup.select('img#mangaFile')[0]['src']

    parts = img_url.split('/')
    fname = parts[-1]
    # Percent-encode only the second-to-last path segment.  quote() emits
    # %20 for spaces, so no '+' clean-up is needed and literal '+' characters
    # elsewhere in the URL are left untouched.
    parts[-2] = parse.quote(parts[-2], safe='')
    img_url = '/'.join(parts)

    response = requests.get(img_url)
    with open(section_path + '/' + fname, 'wb') as f:
        f.write(response.content)
if __name__ == '__main__':
    # Script entry point: crawl the manga, then shut the browser down.
    try:
        getSectionLink()
    finally:
        # Always stop the PhantomJS process, even when the crawl raises,
        # so no orphaned browser is left running.
        phantom.quit()
# Crawl results are shown below:
# 兄弟连学python (Xiongdilian -- learn Python)
# Python learning/exchange and resource-sharing group (QQ): 563626388