Scrapy Crawler Framework, Part 2: Downloading Images and Customization

Exercise 2: Download the weekly ranking images

  1. Download the images based on the data collected in the previous step
  2. Save multi-page ("multi-p") posts into a single folder, one folder per post

Implementation steps:

  1. First, define the Item. Consider which data each item needs to carry:

    class PixivDownloadItem(scrapy.Item):
        folder_name = scrapy.Field()
        is_many = scrapy.Field()
        headers = scrapy.Field()
        final_urls = scrapy.Field()  # note: plural, matching the key the spider fills in
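
    The field must be final_urls (plural): scrapy.Item only accepts keys that were
    declared as Fields, so these names have to match the keys the spider uses.
    A quick illustrative check (hypothetical snippet, not part of the project):

    item = PixivDownloadItem()
    item["final_urls"] = []   # fine: declared above
    item["final_url"] = []    # KeyError: PixivDownloadItem does not support field: final_url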
    
  2. Write the spider, pixiv_download.py:

    import scrapy
    import openpyxl
    from openpyxl.worksheet.worksheet import Worksheet
    from openpyxl.workbook.workbook import Workbook
    import os
    from mydemo.static.my_cookie import cookie, user_agent
    from scrapy import Request
    from mydemo.items import PixivDownloadItem
    
    class PixivDownloadSpider(scrapy.Spider):
        name = "pixiv_download"
        allowed_domains = ["pixiv.net"]
        url_list = list()
        custom_settings = {
            'ITEM_PIPELINES': {
                # with multiple item pipelines in the project, specify which ones this spider uses
                'mydemo.pipelines.PixivDownloadPipeline': 500,
            }
        }
    
        def start_requests(self):
            """读取Excel链接,进行请求,得到ajax请求的地址"""
            _path = os.path.abspath(os.path.dirname(__file__))
            parent_dir = os.path.dirname(_path)
            workbook = openpyxl.load_workbook(f"{parent_dir}/output/pixiv_weekly_rank数据.xlsx")  # type: Workbook
            worksheet = workbook["weekly"]  # type: Worksheet
    
            for row_num in range(2, worksheet.max_row + 1):
                url_obj = {
                    "pic_name": f"{worksheet[f'A{row_num}'].value}_{worksheet[f'B{row_num}'].value}",
                    "download_url": f"https://www.pixiv.net/ajax/illust/{worksheet[f'C{row_num}'].value}/pages?lang=zh",
                    "referer": worksheet[f"D{row_num}"].value
                }
                PixivDownloadSpider.url_list.append(url_obj)
            workbook.close()
    
            for url_obj in PixivDownloadSpider.url_list:
                header = {
                    'User-Agent': user_agent,
                    'Cookie': cookie,
                    'referer': url_obj['referer']
                }
                yield Request(url=url_obj["download_url"],
                              headers=header,
                              callback=self.parse,
                              meta={"headers": header, "pic_name": url_obj["pic_name"]})
    
        def parse(self, response, **kwargs):
            """将得到的下载链接和其他信息进行打包到item,让Pipeline进行处理"""
            datas = response.json()["body"]
            headers = response.meta["headers"]
            pic_name = response.meta["pic_name"]
            is_many = False
            index = 1
            if len(datas) > 1:
                is_many = True
            item = {
                "folder_name": "",
                "is_many": is_many,
                "headers": headers,
                "final_urls": [
                    # {"title": pic_name, "url": "", "file_type": ""} 结构参考
                ]
            }
            for data in datas:
                img_p = {
                    "title": pic_name,
                    "url": data["urls"]["original"],
                    "file_type": data["urls"]["original"].split(".")[-1]
                }
                if is_many:
                    item["folder_name"] = pic_name
                    img_p["title"] = f"p{index}"
                    index += 1
                item["final_urls"].append(img_p)
            yield PixivDownloadItem(item)
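
    For reference, the pages ajax endpoint returns one entry per page of the post;
    parse() above only reads the "original" URL. A trimmed sketch of the response
    shape, reconstructed from the field accesses above (real responses carry more
    keys, and the URLs here are illustrative):

    sample_response = {
        "body": [
            {"urls": {"original": "https://i.pximg.net/img-original/img/.../12345_p0.jpg"}},
            {"urls": {"original": "https://i.pximg.net/img-original/img/.../12345_p1.jpg"}},
        ]
    }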
    

    In mydemo/static/my_cookie.py, the cookie and user agent are defined:

    cookie = "111222444"
    user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) " \
                 "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36"
    
  3. Define in the pipeline how to download and save:

    import openpyxl
    from mydemo.items import DoubanItem, PixivItem, PixivDownloadItem
    import os
    import requests
    
    class PixivDownloadPipeline:
        def __init__(self):
            self.root_path = os.path.abspath(os.path.dirname(__file__))
    
        def process_item(self, item: PixivDownloadItem, spider):
            """对item进行处理,如果是多p就新建文件夹放进去"""
            if item["is_many"]:
                save_path = f"{self.root_path}/output/{item['folder_name']}/"
            else:
                save_path = f"{self.root_path}/output/"
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            for img in item["final_urls"]:
                file_name = f"{img['title']}.{img['file_type']}"
                resp = requests.get(img["url"], headers=item["headers"])
                with open(f"{save_path}{file_name}", 'wb') as file:
                    file.write(resp.content)
            return item
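
    The loop above writes resp.content unconditionally, so an HTTP error page would
    be saved with an image extension. A slightly hardened sketch of the same loop (an
    assumed variant, not the author's code) adds a timeout and a status check:

    for img in item["final_urls"]:
        file_name = f"{img['title']}.{img['file_type']}"
        try:
            resp = requests.get(img["url"], headers=item["headers"], timeout=30)
            resp.raise_for_status()  # raise on 4xx/5xx instead of saving the error body
        except requests.RequestException as err:
            spider.logger.warning("download failed for %s: %s", img["url"], err)
            continue
        with open(f"{save_path}{file_name}", "wb") as file:
            file.write(resp.content)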
    
  4. Modify settings.py to register the item pipeline:

    ITEM_PIPELINES = {
        "mydemo.pipelines.DoubanItemPipeline": 300,
        "mydemo.pipelines.PixivPipeline": 400,
        "mydemo.pipelines.PixivDownloadPipeline": 500
    }
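
    The numbers are priorities (0-1000, lower runs first). Note that each spider above
    also sets ITEM_PIPELINES in its custom_settings, which takes precedence over this
    project-wide value, so only the intended pipeline handles each spider's items.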
    

Finally, run:

scrapy crawl pixiv_download

Exercise 3: Downloading images with a custom ImagesPipeline

Scrapy provides the ImagesPipeline module for downloading images, but renaming files, sorting them into folders, and similar behavior require some customization. For contrast, the sketch below shows the stock, uncustomized usage; the customized implementation then follows step by step.
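
A minimal sketch of stock ImagesPipeline usage (standard Scrapy configuration; the paths and item name are illustrative). Out of the box, the pipeline reads URLs from an item's image_urls field, stores the files under IMAGES_STORE named by a hash of each URL, and writes the results back into the images field; it also requires the Pillow library:

    # settings.py -- enable the stock pipeline and point it at a storage directory
    ITEM_PIPELINES = {"scrapy.pipelines.images.ImagesPipeline": 1}
    IMAGES_STORE = "/path/to/output"  # required: the pipeline will not start without it

    # items.py -- the default field names the stock pipeline expects
    import scrapy

    class StockImageItem(scrapy.Item):
        image_urls = scrapy.Field()  # input: list of image URLs to fetch
        images = scrapy.Field()      # output: download results filled in by the pipeline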

  1. First, the Item is the same as in the previous example and is reused unchanged:

    class PixivDownloadItem(scrapy.Item):
        folder_name = scrapy.Field()
        is_many = scrapy.Field()
        headers = scrapy.Field()
        final_urls = scrapy.Field()
    
  2. The spider is also essentially the same as above; only the name changes, and it points at the new pipeline:

    class PixivNewSpider(scrapy.Spider):
        name = "pixiv_new"
        allowed_domains = ["pixiv.net"]
        url_list = list()
        custom_settings = {
            'ITEM_PIPELINES': {
                # point at the new custom pipeline
                'mydemo.pipelines.PixivImagePipeline': 600,
            }
        }
    
  3. The key piece is PixivImagePipeline, a class inheriting from ImagesPipeline, which processes the items:

    # additional imports needed in pipelines.py for this class
    from scrapy import Request
    from scrapy.exceptions import DropItem
    from scrapy.pipelines.images import ImagesPipeline

    class PixivImagePipeline(ImagesPipeline):
        root_path = os.path.abspath(os.path.dirname(__file__))
    
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            self.workbook = openpyxl.Workbook()
            self.ws_success = self.workbook.active
            self.ws_success.title = "successes"
            self.ws_success.append(('folder', 'file name', 'status', 'download url'))
            self.ws_fail = self.workbook.create_sheet("failures")
            self.ws_fail.append(('folder', 'file name', 'status', 'download url'))
    
        def get_media_requests(self, item: PixivDownloadItem, info):
            for img in item["final_urls"]:
                url = img["url"]
                headers = item["headers"]
                file_name = f"{img['title']}.{img['file_type']}"
                if item["is_many"]:
                    save_path = f"{PixivImagePipeline.root_path}/output/{item['folder_name']}"
                else:
                    save_path = f"{PixivImagePipeline.root_path}/output"
                yield Request(url, headers=headers, meta={"file_name": file_name,
                                                          "save_path": save_path
                                                          })
    
        def file_path(self, request, response=None, info=None, *, item=None):
            """返回保存的完整路径,例如static/output/aaa.jpg"""
            file_name = request.meta["file_name"]
            save_path = request.meta["save_path"]
            if not os.path.exists(save_path):
                os.makedirs(save_path)
            filename = f'{save_path}/{file_name}'  # build the image's save path and file name
            return filename
    
        def item_completed(self, results, item, info):
            """对下载完成的图片进行处理,这里的item是单个的,
            这里results有多个,是因为一个item中可能会有多个下载链接。
            每次对单个item中的所有下载链接处理完,就会调用一次item_completed"""
            for is_ok, result in results:
                # each results entry is a tuple: (success flag, info dict on success / Failure on error)
                if is_ok:
                    row = []
                    for img in item["final_urls"]:
                        if result["url"] == img["url"]:
                            row = [item["folder_name"], img["title"], "下载成功", result["url"]]
                    self.ws_success.append(row)
                else:
                    for img in item["final_urls"]:
                        fail_row = [item["folder_name"], img["title"], "下载失败", img["url"]]
                        self.ws_fail.append(fail_row)
                    raise DropItem('Image Downloaded Failed')
            return item
    
        def close_spider(self, spider):
            # path of the directory containing this file
            root_path = os.path.abspath(os.path.dirname(__file__))
            self.workbook.save(f'{root_path}/output/pixiv_weekly_download_report.xlsx')
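
    For reference, Scrapy documents the results argument of item_completed as a list
    of 2-tuples: (True, info dict) for a success, (False, Failure) for an error. A
    trimmed illustration (the field values are made up):

    results = [
        (True, {
            "url": "https://i.pximg.net/img-original/img/.../12345_p0.jpg",  # requested URL
            "path": "output/xxx/p1.jpg",  # where file_path() said to store it
            "checksum": "2a463...",       # MD5 hash of the downloaded bytes
            "status": "downloaded",       # 'downloaded', 'uptodate' or 'cached'
        }),
    ]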
    
  4. Finally, don't forget to register the pipeline in settings.py:

    ITEM_PIPELINES = {
        "mydemo.pipelines.DoubanItemPipeline": 300,
        "mydemo.pipelines.PixivPipeline": 400,
        "mydemo.pipelines.PixivDownloadPipeline": 500,
        "mydemo.pipelines.PixivImagePipeline": 600,
    }
    

    Finally, run:

    scrapy crawl pixiv_new
    