Python爬虫使用实例-漫kzhan

最新推荐文章于 2024-10-05 08:57:27 发布

镜花照无眠

最新推荐文章于 2024-10-05 08:57:27 发布

阅读量1.4k

点赞数 35

分类专栏： # Python Python爬虫专栏文章标签： python 爬虫开发语言

本文链接：https://blog.csdn.net/weixin_45693567/article/details/141905815

版权

Python 同时被 2 个专栏收录

34 篇文章 2 订阅

订阅专栏

Python爬虫专栏

8 篇文章 0 订阅

订阅专栏

环境配置

pip install requests parsel pillow pypdf

（os 和 shutil 属于 Python 标准库，无需通过 pip 安装；requests 是下文所有脚本都要用到的第三方库。）

1_/ 单个章节

singleChapter

需要获取参数：chapter_id与comic_id，可能要sign和uid

获取请求地址
在这里插入图片描述

url='https://comic.mkzhan.com/chapter/content/v1/' # request URL

获取请求参数
在这里插入图片描述

# Query parameters sent with the request; all values are strings.
data={
    'chapter_id':'499715',
    'comic_id':'209405',
    'format':'1',
    'quality':'1',
    'sign':'0',
    'type':'1',
    'uid':'0',
}

在这里插入图片描述

# Walk the entries under data -> page of the JSON response and read the
# 'image' field (the image URL) of each entry.
for index in response.json()['data']['page']:
    img_url=index['image']

实现代码：

# Single chapter: download every page image of one chapter into ./output
import os
import requests

url = 'https://comic.mkzhan.com/chapter/content/v1/'  # request URL
data = {
    # 'chapter_id': '997698',
    'chapter_id': '639648',
    'comic_id': '211604',
    'format': '1',
    'quality': '1',
    'sign': 'bf511db7ee8e01fd18a888b0039cfefa',
    'type': '1',
    'uid': '75377874',
}
# Send a browser-like User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}

# The target folder must exist before any file is opened for writing.
output_dir = 'output'
os.makedirs(output_dir, exist_ok=True)

# Fetch the chapter description; stop on an HTTP error instead of failing
# later while decoding the body.
response = requests.get(url=url, params=data, headers=headers, timeout=10)
response.raise_for_status()

# Pages are saved as 1.jpg, 2.jpg, ... in the order the response lists them.
for img_name, index in enumerate(response.json()['data']['page'], start=1):
    img_url = index['image']
    img_response = requests.get(url=img_url, headers=headers, timeout=10)
    img_response.raise_for_status()
    # os.path.join keeps the path valid on every platform, not only Windows.
    with open(os.path.join(output_dir, f'{img_name}.jpg'), mode='wb') as f:
        f.write(img_response.content)

2_/ 合成长图

longPicture
需要获取参数：chapter_id与comic_id，可能要sign和uid

用 Python 的 os 模块来检查文件夹是否存在，如果不存在，则创建它。

# Check whether the folder exists; create it (including parents) if it does not.
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

合成长图，要将文件夹中的所有jpg图片合并成一张长图，可以使用 python 的 pillow 库。读取指定文件夹中的所有jpg图片，并将它们依次合并成一张长图。

导入库：os库用于文件处理，Image 用于图片操作。读取指定文件夹中的所有.jpg 图片，并按名称排序。
排序: 在 images.sort() 中，将排序方式更改为通过提取文件名中的数字进行排序。 key=lambda x: int(os.path.splitext(x)[0]) 先去掉文件扩展名，再将其转换为整数进行排序。排序之前先用 img[:-4].isdigit() 过滤文件列表：只保留去掉 4 个字符的扩展名（.jpg）后文件名全部由数字组成的文件（例如 merged.jpg 会被排除），这样后面的 int() 转换才不会出错。

# Keep only the files whose name, minus its last four characters, is all digits.
images = [img for img in images if img[:-4].isdigit()]  
# Sort numerically by the integer value of the file name without its extension.
images.sort(key=lambda x: int(os.path.splitext(x)[0]))

计算：算所有图片的总高度和最大宽度，以便创建合成图像。
创建新图像: 使用 Image.new 创建一张新的空白图像。逐一将读取的图片粘贴到新图像中。
保存图像: 将合成后的图像保存到指定路径。
生成长图后用 os.remove() 函数删除原始图片（仅保留merged.jpg）

实现代码：

# 单个章节并合成长图
import os
from PIL import Image
import requests
url = 'https://comic.mkzhan.com/chapter/content/v1/'  # request URL
data = {
    'chapter_id': '639633',
    'comic_id': '211604',
    'format': '1',
    'quality': '1',
    'type': '1',
}  # request parameters
# Send a browser-like User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
# Fetch the chapter description once at import time; get_img() reads `response`.
# Stop on an HTTP error instead of failing later while decoding the body.
response = requests.get(url=url, params=data, headers=headers, timeout=10)
response.raise_for_status()
# 获取
def get_img(folder='output'):
    """Download every page image of the chapter into *folder*.

    Reads the module-level ``response`` (the chapter description) and
    ``headers``. Pages are written as ``1.jpg``, ``2.jpg``, ... in the order
    the response lists them.

    Args:
        folder: Target directory; created if it does not exist.

    Raises:
        requests.HTTPError: If an image request returns an error status.
    """
    os.makedirs(folder, exist_ok=True)
    for img_name, index in enumerate(response.json()['data']['page'], start=1):
        img_response = requests.get(url=index['image'], headers=headers, timeout=10)
        # Do not write an error page to disk under an image name.
        img_response.raise_for_status()
        # os.path.join puts the file inside the folder on every platform;
        # string concatenation with a backslash only worked on Windows.
        with open(os.path.join(folder, f'{img_name}.jpg'), mode='wb') as f:
            f.write(img_response.content)
        
# 合并当前章节的图片为长图, 按顺序
def merge_images_vertically_in_order(image_folder='output'):
    """Stitch the numbered JPG pages in *image_folder* into one long image.

    Only files named ``<number>.jpg`` are used, stacked top to bottom in
    numeric order. The result is saved as ``merged.jpg`` in the same folder
    and the page files are deleted afterwards. If the folder holds no such
    files, a message is printed and nothing is written or deleted.

    Args:
        image_folder: Directory that holds the page images.
    """
    # The stem is passed to int() below, so filter with isdecimal(): it only
    # accepts characters that int() can parse.
    names = [
        name for name in os.listdir(image_folder)
        if name.endswith('.jpg') and os.path.splitext(name)[0].isdecimal()
    ]
    if not names:
        # max() over an empty sequence would raise ValueError further down.
        print(f'未找到可合并的图片: {image_folder}')
        return
    names.sort(key=lambda name: int(os.path.splitext(name)[0]))
    page_paths = [os.path.join(image_folder, name) for name in names]

    image_objects = []
    try:
        for path in page_paths:
            image_objects.append(Image.open(path))

        # The canvas is as wide as the widest page and as tall as all pages together.
        total_height = sum(img.height for img in image_objects)
        max_width = max(img.width for img in image_objects)
        new_image = Image.new('RGB', (max_width, total_height))

        # Paste the pages one below the other.
        current_height = 0
        for img in image_objects:
            new_image.paste(img, (0, current_height))
            current_height += img.height
    finally:
        # Release the file handles before the files are removed.
        for img in image_objects:
            img.close()

    merged_path = os.path.join(image_folder, 'merged.jpg')
    new_image.save(merged_path)
    print(f'合成的长图已保存为: {merged_path}')

    # Remove the page files; merged.jpg is not in page_paths and is kept.
    for path in page_paths:
        os.remove(path)
    print('原始图片已删除.')

# Example usage: download the chapter's pages, then stitch them into one long image.
get_img()
# merge_images_vertically()
merge_images_vertically_in_order()

3_/ 全部章节

multiChapter
需要获取参数：comic_id，可能要sign和uid
合并为长图，并合并长图为pdf

把长图放在图的上一级便于取用：用 Python 的 shutil 模块中的 move() 函数。将指定路径下的文件移动到其上一级文件夹。

# Move the long image into main_folder under the name '<chapter_name>.png'.
shutil.move(long_img_path, os.path.join(main_folder, f'{chapter_name}.png'))

使用 PyPDF2（或 pypdf）库来将文件夹中的图片按顺序（如果有最终话或者最后话的话放在最后面，序章则放在最前面）合并成 PDF.

实现代码：

# 当然文件名之类的可能还需要修改一下
import os
import shutil
import requests
import parsel
from PIL import Image
from pypdf import PdfWriter
url1 = 'https://www.mkzhan.com/214829/'  # request URL of the comic page
# Send a browser-like User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
# Stop on an HTTP error instead of parsing an error page for chapters.
page_response = requests.get(url=url1, headers=headers, timeout=10)
page_response.raise_for_status()
html_data = page_response.text
# Select the chapter items with a CSS query.
selector = parsel.Selector(html_data)
# NOTE: this name shadows the builtin `list` for the rest of the script, so
# list(...) can no longer be called; it is kept because get_data() reads it.
list = selector.css('.chapter__list .chapter__list-box .chapter__item')

long_images = []  # paths of the long images, one per chapter

# Create the main folder. os.path.join keeps the path valid on every platform.
main_folder = os.path.join('output', 'XⅪ.Awaken')
os.makedirs(main_folder, exist_ok=True)
# 获取数据
# Characters that are not allowed in Windows file names. Separators, ':' and
# '*' become '-'; the remaining ones are dropped.
_UNSAFE_NAME_CHARS = str.maketrans({
    '/': '-', '\\': '-', ':': '-', '*': '-',
    '?': None, '"': None, '<': None, '>': None, '|': None,
})


def _safe_name(name):
    """Return *name* with characters unsuitable for a file or folder name replaced."""
    return name.translate(_UNSAFE_NAME_CHARS)


def _download_pages(pages, chapter_folder, chapter_name):
    """Save each page image into *chapter_folder*; return the file paths in page order."""
    paths = []
    for img_name, page in enumerate(pages, start=1):
        img_file_path = os.path.join(chapter_folder, f'{chapter_name}_{img_name}.png')
        # Check before downloading so an existing page costs no request.
        if os.path.exists(img_file_path):
            print(f"图 {img_file_path} 已存在。")
        else:
            img_response = requests.get(url=page['image'], headers=headers, timeout=10)
            img_response.raise_for_status()
            with open(img_file_path, mode='wb') as f:
                f.write(img_response.content)
        paths.append(img_file_path)
    return paths


def _stitch_vertically(image_paths, target_path):
    """Stack the images at *image_paths* top to bottom and save the result to *target_path*."""
    opened = []
    try:
        for path in image_paths:
            opened.append(Image.open(path))
        canvas = Image.new(
            'RGB',
            (max(img.width for img in opened), sum(img.height for img in opened)),
        )
        current_height = 0
        for img in opened:
            canvas.paste(img, (0, current_height))
            current_height += img.height
        canvas.save(target_path)
    finally:
        for img in opened:
            img.close()


def get_data():
    """Download every chapter in the module-level ``list`` and build one long image per chapter.

    For each chapter item the page images are saved to
    ``main_folder/<chapter name>/<chapter name>_<n>.png``. They are then
    stitched into ``<chapter name>.png``, which is moved up into
    ``main_folder``. The final path is appended to ``long_images`` once per
    chapter. Pages and long images that already exist are not produced again.

    Raises:
        requests.HTTPError: If the chapter or an image request returns an error status.
    """
    url = 'https://comic.mkzhan.com/chapter/content/v1/'
    for i in list:
        chapter_id = i.css('a::attr(data-chapterid)').get()
        texts = i.css('a::text').getall()
        if chapter_id is None or not texts:
            # Without an id or a title the chapter can neither be requested nor named.
            print("跳过无法解析的章节条目。")
            continue
        raw_name = texts[-1].strip()
        print(chapter_id, raw_name)

        # Sanitise before the name is first used for a folder, so the folder
        # and the files inside it agree on the name.
        chapter_name = _safe_name(raw_name)
        chapter_folder = os.path.join(main_folder, chapter_name)
        os.makedirs(chapter_folder, exist_ok=True)

        # Request parameters.
        data = {
            'chapter_id': chapter_id,
            'comic_id': '214829',  # same comic id as in url1
            'format': '1',
            'quality': '1',
            'type': '1',
        }
        response = requests.get(url=url, params=data, headers=headers, timeout=10)
        response.raise_for_status()

        images = _download_pages(response.json()['data']['page'], chapter_folder, chapter_name)
        if not images:
            continue

        # The long image ends up one level above the chapter folder, so that
        # is the location to test for an earlier run.
        final_path = os.path.join(main_folder, f'{chapter_name}.png')
        if os.path.exists(final_path):
            print(f"长图 {final_path} 已存在。")
        else:
            long_img_path = os.path.join(chapter_folder, f'{chapter_name}.png')
            _stitch_vertically(images, long_img_path)
            shutil.move(long_img_path, final_path)
        long_images.append(final_path)
# 设置图片文件夹路径和输出 PDF 文件路径
def _chapter_sort_key(title):
    """Return the sort key that puts prologue titles first and final chapters last.

    Titles in between are ordered by the number formed from all decimal
    digits in the title (0 if there are none), then by the title itself.
    """
    if "序章" in title:
        return (0, 0, title)
    # Each marker needs its own membership test; a bare non-empty string in an
    # `or` chain is always true and would send every title into this branch.
    if any(marker in title for marker in ("最终话", "最后话", "新oc")):
        return (2, 0, title)
    number_part = ''.join(filter(str.isdecimal, title))
    return (1, int(number_part) if number_part else 0, title)


def merged_pdf(image_folder=None, output_pdf_path=None):
    """Merge the images in *image_folder* into one PDF, one image per page.

    Files ending in .png/.jpg/.jpeg/.gif (case-insensitive) are used, ordered
    by ``_chapter_sort_key`` applied to the file name without its extension.
    Each image is first saved as a temporary single-page PDF in the folder.
    The temporary files are removed again, also when an error occurs.

    Args:
        image_folder: Folder with the images; defaults to ``output/XⅪ.Awaken``.
        output_pdf_path: Path of the resulting PDF; defaults to
            ``XⅪ.Awaken.pdf`` inside *image_folder*.
    """
    if image_folder is None:
        image_folder = os.path.join('output', 'XⅪ.Awaken')
    if output_pdf_path is None:
        output_pdf_path = os.path.join(image_folder, 'XⅪ.Awaken.pdf')

    image_files = [
        name for name in os.listdir(image_folder)
        if name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif'))
    ]
    if not image_files:
        print(f'未找到可合并的图片: {image_folder}')
        return
    # Sort the file names themselves so that name and image cannot get out of step.
    image_files.sort(key=lambda name: _chapter_sort_key(os.path.splitext(name)[0]))

    pdf_writer = PdfWriter()
    temp_paths = []
    try:
        for position, image_file in enumerate(image_files):
            title = os.path.splitext(image_file)[0]
            # The position keeps temp names unique even if two files share a stem.
            img_pdf_path = os.path.join(image_folder, f"temp_{position}_{title}.pdf")
            temp_paths.append(img_pdf_path)
            with Image.open(os.path.join(image_folder, image_file)) as img:
                img.save(img_pdf_path, "PDF", quality=100)
            pdf_writer.append(img_pdf_path)

        with open(output_pdf_path, 'wb') as f:
            pdf_writer.write(f)
    finally:
        # Clean up the temporary single-page PDFs.
        for img_pdf_path in temp_paths:
            if os.path.exists(img_pdf_path):
                os.remove(img_pdf_path)

    print(f'PDF 文件已生成：{output_pdf_path}')


# Download all chapters, then merge the resulting images into one PDF.
get_data()
merged_pdf()

4_/ 可选章节

OptionalmultiChapter

下载comic, def get_data(start=0, end=None) 可指定章节, 通过改变参数start和end

可能要注意一下顺序, 而且有番外
在这里插入图片描述
倒序从前往后,前面的为0 正序则从后往前, 后面的为0
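说明：chapter_name = i.css('a::text').getall()[-1].strip() 中的 [-1] 只是取该 <a> 标签内最后一个文本节点，与章节列表的先后顺序无关，不需要因为倒序而改成 [0]。
这里如果写成 for i in list(reversed(list))[start:end]: 会报下面的错误。原因并不是 reversed 不能用，而是变量名 list 覆盖了内置函数 list，于是 list(...) 实际上是在“调用”一个 SelectorList 对象。把变量改成别的名字（例如 chapter_items）之后就可以写 list(reversed(chapter_items))[start:end]；不改名的话也可以用切片 list[::-1][start:end] 达到同样的效果：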

TypeError: ‘SelectorList’ object is not callable

改start值吧, 若倒序第十话 start=9 若正序第十话 start=len(list)-10
第一话start=len(list)-1 end=None也即end = len(list)

import os
import shutil
import requests
import parsel
from PIL import Image
from pypdf import PdfWriter
url1 = 'https://www.mkzhan.com/209405/'  # request URL of the comic page
# Send a browser-like User-Agent with every request.
headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.139 Safari/537.36'}
# Stop on an HTTP error instead of parsing an error page for chapters.
page_response = requests.get(url=url1, headers=headers, timeout=10)
page_response.raise_for_status()
html_data = page_response.text
# Select the chapter items with a CSS query.
selector = parsel.Selector(html_data)
# NOTE: this name shadows the builtin `list` for the rest of the script, so
# list(...) can no longer be called; it is kept because get_data() reads it.
list = selector.css('.chapter__list .chapter__list-box .chapter__item')
long_images = []  # paths of the long images, one per chapter

# Create the main folder. os.path.join keeps the path valid on every platform.
main_folder = os.path.join('output', '非人哉')
os.makedirs(main_folder, exist_ok=True)

# 获取数据
# start: 指定开始章节的索引（默认为0）。
# end: 指定结束章节的索引（默认为None，表示获取到列表的最后一章）。
# 例如 get_data(start=0, end=10) 会获取前10章。
# get_data(start=5) 会从第5章开始获取到最后一章。
# Characters that are not allowed in Windows file names. Separators, ':' and
# '*' become '-'; the remaining ones are dropped.
_UNSAFE_NAME_CHARS = str.maketrans({
    '/': '-', '\\': '-', ':': '-', '*': '-',
    '?': None, '"': None, '<': None, '>': None, '|': None,
})


def _safe_name(name):
    """Return *name* with characters unsuitable for a file or folder name replaced."""
    return name.translate(_UNSAFE_NAME_CHARS)


def _download_pages(pages, chapter_folder, chapter_name):
    """Save each page image into *chapter_folder*; return the file paths in page order."""
    paths = []
    for img_name, page in enumerate(pages, start=1):
        img_url = page['image']
        print(img_url)
        img_file_path = os.path.join(chapter_folder, f'{chapter_name}_{img_name}.png')
        # Check before downloading so an existing page costs no request.
        if os.path.exists(img_file_path):
            print(f"图 {img_file_path} 已存在。")
        else:
            img_response = requests.get(url=img_url, headers=headers, timeout=10)
            img_response.raise_for_status()
            with open(img_file_path, mode='wb') as f:
                f.write(img_response.content)
        paths.append(img_file_path)
    return paths


def _stitch_vertically(image_paths, target_path):
    """Stack the images at *image_paths* top to bottom and save the result to *target_path*."""
    opened = []
    try:
        for path in image_paths:
            opened.append(Image.open(path))
        canvas = Image.new(
            'RGB',
            (max(img.width for img in opened), sum(img.height for img in opened)),
        )
        current_height = 0
        for img in opened:
            canvas.paste(img, (0, current_height))
            current_height += img.height
        canvas.save(target_path)
    finally:
        for img in opened:
            img.close()


def get_data(start=len(list)-10, end=None):
    """Download the chapters ``list[start:end]`` and build one long image per chapter.

    For each selected chapter item the page images are saved to
    ``main_folder/<chapter name>/<chapter name>_<n>.png``. They are then
    stitched into ``<chapter name>.png``, which is moved up into
    ``main_folder``. The final path is appended to ``long_images`` once per
    chapter. Pages and long images that already exist are not produced again.

    Args:
        start: Index of the first chapter item to fetch. The default,
            ``len(list) - 10``, is evaluated once when the function is defined.
        end: Index one past the last chapter item; ``None`` means up to the
            end of the list.

    Raises:
        requests.HTTPError: If the chapter or an image request returns an error status.
    """
    url = 'https://comic.mkzhan.com/chapter/content/v1/'
    # A slice end of None already means "to the end of the list".
    for i in list[start:end]:
        chapter_id = i.css('a::attr(data-chapterid)').get()
        texts = i.css('a::text').getall()
        if chapter_id is None or not texts:
            # Without an id or a title the chapter can neither be requested nor named.
            print("跳过无法解析的章节条目。")
            continue
        raw_name = texts[-1].strip()
        print(chapter_id, raw_name)

        # Sanitise before the name is first used for a folder, so the folder
        # and the files inside it agree on the name.
        chapter_name = _safe_name(raw_name)
        chapter_folder = os.path.join(main_folder, chapter_name)
        os.makedirs(chapter_folder, exist_ok=True)

        # Request parameters.
        data = {
            'chapter_id': chapter_id,
            'comic_id': 209405,  # same comic id as in url1
            'format': '1',
            'quality': '1',
            'type': '1',
        }
        response = requests.get(url=url, params=data, headers=headers, timeout=10)
        response.raise_for_status()

        images = _download_pages(response.json()['data']['page'], chapter_folder, chapter_name)
        if not images:
            continue

        # The long image ends up one level above the chapter folder, so that
        # is the location to test for an earlier run.
        final_path = os.path.join(main_folder, f'{chapter_name}.png')
        if os.path.exists(final_path):
            print(f"长图 {final_path} 已存在。")
        else:
            long_img_path = os.path.join(chapter_folder, f'{chapter_name}.png')
            _stitch_vertically(images, long_img_path)
            shutil.move(long_img_path, final_path)
        long_images.append(final_path)

# 设置图片文件夹路径和输出 PDF 文件路径
def _chapter_sort_key(title):
    """Return the sort key that puts prologue titles first and final chapters last.

    Titles in between are ordered by the number formed from all decimal
    digits in the title (0 if there are none), then by the title itself.
    """
    if "序章" in title:
        return (0, 0, title)
    # Each marker needs its own membership test; a bare non-empty string in an
    # `or` chain is always true and would send every title into this branch.
    if any(marker in title for marker in ("最终话", "最后话", "新oc")):
        return (2, 0, title)
    number_part = ''.join(filter(str.isdecimal, title))
    return (1, int(number_part) if number_part else 0, title)


def merged_pdf(image_folder=None, output_pdf_path=None):
    """Merge the images in *image_folder* into one PDF, one image per page.

    Files ending in .png/.jpg/.jpeg/.gif (case-insensitive) are used, ordered
    by ``_chapter_sort_key`` applied to the file name without its extension.
    Each image is first saved as a temporary single-page PDF in the folder.
    The temporary files are removed again, also when an error occurs.

    Args:
        image_folder: Folder with the images; defaults to ``output/非人哉``.
        output_pdf_path: Path of the resulting PDF; defaults to
            ``非人哉.pdf`` inside *image_folder*.
    """
    if image_folder is None:
        image_folder = os.path.join('output', '非人哉')
    if output_pdf_path is None:
        output_pdf_path = os.path.join(image_folder, '非人哉.pdf')

    image_files = [
        name for name in os.listdir(image_folder)
        if name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif'))
    ]
    if not image_files:
        print(f'未找到可合并的图片: {image_folder}')
        return
    # Sort the file names themselves so that name and image cannot get out of step.
    image_files.sort(key=lambda name: _chapter_sort_key(os.path.splitext(name)[0]))

    pdf_writer = PdfWriter()
    temp_paths = []
    try:
        for position, image_file in enumerate(image_files):
            title = os.path.splitext(image_file)[0]
            # The position keeps temp names unique even if two files share a stem.
            img_pdf_path = os.path.join(image_folder, f"temp_{position}_{title}.pdf")
            temp_paths.append(img_pdf_path)
            with Image.open(os.path.join(image_folder, image_file)) as img:
                img.save(img_pdf_path, "PDF", quality=100)
            pdf_writer.append(img_pdf_path)

        with open(output_pdf_path, 'wb') as f:
            pdf_writer.write(f)
    finally:
        # Clean up the temporary single-page PDFs.
        for img_pdf_path in temp_paths:
            if os.path.exists(img_pdf_path):
                os.remove(img_pdf_path)

    print(f'PDF 文件已生成：{output_pdf_path}')

# Download the selected chapters; the PDF merge step is left disabled here.
get_data()
# merged_pdf()