（70）--爬取哦漫画图片并下载到相应文件夹

最新推荐文章于 2021-05-30 21:02:40 发布

Fredreck1919

最新推荐文章于 2021-05-30 21:02:40 发布

阅读量687

点赞数

分类专栏： Python爬虫　文章标签： 哦漫画

本文链接：https://blog.csdn.net/Fredreck1919/article/details/79857072

版权

Python爬虫专栏收录该内容

17 篇文章 0 订阅

订阅专栏

# 爬取哦漫画图片并下载到相应文件夹

from selenium import webdriver
import time
import requests
from bs4 import BeautifulSoup
import re
from urllib import request,parse
import os

# Module-level headless browser shared by the whole script: getMangaPage()
# loads reader pages through it and the __main__ block quits it at the end.
# NOTE(review): the PhantomJS executable path is hard-coded to a local Windows
# install and the driver is started as soon as this module is loaded.

phantom = webdriver.PhantomJS(executable_path=r'E:\Python\phantomjs-2.1.1-windows\bin\phantomjs.exe')

def getSectionLink(base_url='http://www.omanhua.com/comic/4014/'):
    """Crawl a comic's index page and download all of its chapters.

    Fetches ``base_url``, creates ``img/<manga name>/`` and, for every chapter
    link found under ``div.subBookList``, creates
    ``img/<manga name>/<index>-<chapter title>`` and passes the chapter URL
    together with that folder to ``getManga``.

    Args:
        base_url: URL of the comic's index page. Defaults to the comic this
            script was originally written for; chapter hrefs are resolved
            relative to it.

    Raises:
        IndexError: if the page contains no ``div.main01_content h2`` heading.
    """
    response = requests.get(base_url)
    response.encoding = 'utf-8'
    html = BeautifulSoup(response.text, 'lxml')

    # The heading text is the title plus a fixed label (presumably
    # "<title>漫画简介：" -- confirm against the live page). str.strip(chars)
    # treats its argument as a *set* of characters, so it could also eat
    # characters that belong to the title; remove the exact label instead.
    label = '漫画简介：'
    manga_name = html.select('div.main01_content h2')[0].text.strip()
    if manga_name.endswith(label):
        manga_name = manga_name[:-len(label)]
    if manga_name.startswith(label):
        manga_name = manga_name[len(label):]
    manga_name = manga_name.strip()

    manga_path = 'img/' + manga_name
    os.makedirs(manga_path, exist_ok=True)

    # The list is reversed before numbering so that the folder index follows
    # the reversed page order. Each chapter folder is created right before
    # that chapter is downloaded.
    link_list = html.select('div.subBookList ul li a')
    link_list.reverse()
    for index, link in enumerate(link_list):
        section_path = manga_path + '/' + str(index) + '-' + link.text
        os.makedirs(section_path, exist_ok=True)

        # urljoin keeps site-absolute hrefs ("/comic/...") on base_url's host
        # and also copes with fully-qualified hrefs.
        fullurl = parse.urljoin(base_url, link['href'])
        print(section_path)
        getManga(fullurl, section_path)


def getManga(fullurl,section_path):
    """Download every page of a single chapter.

    Fetches the chapter page at ``fullurl``, extracts the total page count
    from the markup that follows ``id="page"`` and calls ``getMangaPage`` once
    per page number with the URL ``<fullurl>index.html?p=<n>``.

    Args:
        fullurl: URL of the chapter. Page URLs are built by plain string
            concatenation, so it is expected to end with '/'.
        section_path: Folder handed through to ``getMangaPage``.
    """
    print(fullurl)
    response = requests.get(fullurl)
    response.encoding = 'utf-8'
    html = response.text

    # Total page count: the digits after "span>/" following id="page".
    # Raw string so that \d reaches the regex engine instead of being an
    # invalid string escape.
    max_pat = re.compile(r'id="page".*?span>/(\d+)', re.S)
    res = max_pat.search(html)
    if res is None:
        print('最大页数获取失败')
        return

    max_page = int(res.group(1))
    for i in range(1, max_page + 1):
        page_fullurl = fullurl + 'index.html?p=' + str(i)
        getMangaPage(page_fullurl, section_path)

# Download one comic page image
def getMangaPage(fullurl,section_path):
    """Render one reader page in PhantomJS and save its image to disk.

    Loads ``fullurl`` in the shared ``phantom`` driver, reads the ``src`` of
    ``img#mangaFile`` from the rendered DOM, downloads that image with
    ``requests`` and writes it to ``<section_path>/<image file name>``.
    A page whose image cannot be located or downloaded is reported on stdout
    and skipped.

    Args:
        fullurl: URL of the reader page (including the ``?p=<n>`` selector).
        section_path: Existing folder the image is written into.
    """
    phantom.get(fullurl)
    # Brief pause before reading the DOM (presumably to let the page script
    # fill in the image source -- confirm).
    time.sleep(0.1)
    html = BeautifulSoup(phantom.page_source, 'lxml')

    img_tags = html.select('img#mangaFile')
    if not img_tags or not img_tags[0].get('src'):
        # Previously this raised IndexError and aborted the whole crawl.
        print('图片地址获取失败: ' + fullurl)
        return
    img_url = img_tags[0]['src']

    parts = img_url.split('/')
    fname = parts[-1]

    # Percent-encode the directory segment just before the file name.
    # quote() encodes spaces as %20 directly, so there is no need to undo
    # '+' afterwards -- the old global replace('+', ' ') also rewrote '+'
    # characters in unrelated parts of the URL.
    parts[-2] = parse.quote(parts[-2], safe='')
    response = requests.get('/'.join(parts))

    # Do not store an HTTP error page under an image file name.
    if response.status_code != 200:
        print('图片下载失败: ' + img_url)
        return

    with open(section_path + '/' + fname, 'wb') as f:
        f.write(response.content)

if __name__ == '__main__':
    # Always shut the PhantomJS process down, even when the crawl aborts with
    # an exception; otherwise the headless browser keeps running.
    try:
        getSectionLink()
    finally:
        phantom.quit()

# 爬取结果如下: