实践2:下载周榜图片
- 根据上一步的数据,下载图片
- 多p的图片保存至一个文件夹,如下所示
实现步骤:
-
首先定义Items,想想需要放哪些数据在item中
class PixivDownloadItem(scrapy.Item):
    """Item carrying everything the download pipeline needs for one artwork."""
    folder_name = scrapy.Field()  # target folder name for multi-page artworks
    is_many = scrapy.Field()      # True when the artwork has more than one page
    headers = scrapy.Field()      # headers (UA/cookie/referer) reused for the image download
    # Fixed: spider and pipeline both use the key "final_urls" (plural);
    # declaring "final_url" here would make PixivDownloadItem(item) raise KeyError.
    final_urls = scrapy.Field()   # list of {"title", "url", "file_type"} dicts
-
编写spider程序pixiv_download.py
import scrapy
import openpyxl
from openpyxl.worksheet.worksheet import Worksheet
from openpyxl.workbook.workbook import Workbook
import os

from mydemo.static.my_cookie import cookie, user_agent
from scrapy import Request
from mydemo.items import PixivDownloadItem


class PixivDownloadSpider(scrapy.Spider):
    """Read the weekly-rank Excel produced by the previous step and download
    every artwork's original image(s) via pixiv's ajax "pages" API."""

    name = "pixiv_download"
    allowed_domains = ["pixiv.net"]
    url_list = list()
    custom_settings = {
        'ITEM_PIPELINES': {
            # With several pipelines in the project, pin the one this spider uses.
            'mydemo.pipelines.PixivDownloadPipeline': 500,
        }
    }

    def start_requests(self):
        """Read artwork links from the Excel file and request the ajax page API."""
        _path = os.path.abspath(os.path.dirname(__file__))
        parent_dir = os.path.dirname(_path)
        workbook = openpyxl.load_workbook(
            f"{parent_dir}/output/pixiv_weekly_rank数据.xlsx")  # type: Workbook
        worksheet = workbook["weekly"]  # type: Worksheet
        # Row 1 is the header; columns are A=rank, B=title, C=illust id, D=page url.
        for row_num in range(2, worksheet.max_row + 1):
            url_obj = {
                "pic_name": f"{worksheet[f'A{row_num}'].value}_{worksheet[f'B{row_num}'].value}",
                "download_url": f"https://www.pixiv.net/ajax/illust/{worksheet[f'C{row_num}'].value}/pages?lang=zh",
                "referer": worksheet[f"D{row_num}"].value
            }
            PixivDownloadSpider.url_list.append(url_obj)
        workbook.close()
        for url_obj in PixivDownloadSpider.url_list:
            header = {
                'User-Agent': user_agent,
                'Cookie': cookie,
                'referer': url_obj['referer']
            }
            yield Request(url=url_obj["download_url"], headers=header,
                          callback=self.parse,
                          meta={"headers": header, "pic_name": url_obj["pic_name"]})

    def parse(self, response, **kwargs):
        """Pack the resolved download URLs plus metadata into an item for the pipeline."""
        datas = response.json()["body"]
        headers = response.meta["headers"]
        pic_name = response.meta["pic_name"]
        is_many = len(datas) > 1
        index = 1
        item = {
            "folder_name": "",
            "is_many": is_many,
            "headers": headers,
            "final_urls": [
                # entries look like {"title": ..., "url": ..., "file_type": ...}
            ]
        }
        for data in datas:
            img_p = {
                "title": pic_name,
                "url": data["urls"]["original"],
                "file_type": data["urls"]["original"].split(".")[-1]
            }
            if is_many:
                # Multi-page artwork: files go into a folder named after the
                # artwork and are renamed p1, p2, ... inside it.
                item["folder_name"] = pic_name
                img_p["title"] = f"p{index}"
                index += 1
            item["final_urls"].append(img_p)
        yield PixivDownloadItem(item)
在mydemo/static/my_cookie.py中,定义了cookie和agent
# Credentials used for pixiv requests; replace cookie with your own
# logged-in session cookie before running the spider.
cookie = "111222444"

# Desktop Chrome UA string so the requests look like a normal browser.
user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) "
              "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.159 Safari/537.36")
-
在Pipeline中定义如何下载和保存
import openpyxl
from mydemo.items import DoubanItem, PixivItem, PixivDownloadItem
import os
import requests


class PixivDownloadPipeline:
    """Download every image URL in the item and save it under ./output."""

    def __init__(self):
        # Directory of pipelines.py; the output/ folder lives next to it.
        self.root_path = os.path.abspath(os.path.dirname(__file__))

    def process_item(self, item: PixivDownloadItem, spider):
        """Save the images; multi-page artworks get their own sub-folder."""
        if item["is_many"]:
            save_path = f"{self.root_path}/output/{item['folder_name']}/"
        else:
            save_path = f"{self.root_path}/output/"
        if not os.path.exists(save_path):
            os.makedirs(save_path)
        for img in item["final_urls"]:
            file_name = f"{img['title']}.{img['file_type']}"
            resp = requests.get(img["url"], headers=item["headers"])
            # Fixed: without this check a 403/404 error body would be
            # silently written to disk as if it were the image.
            resp.raise_for_status()
            with open(f"{save_path}{file_name}", 'wb') as file:
                file.write(resp.content)
        return item
-
修改settings.py ,增加数据管道
# Registered data pipelines; the number is the priority (lower runs first).
ITEM_PIPELINES = {
    "mydemo.pipelines.DoubanItemPipeline": 300,
    "mydemo.pipelines.PixivPipeline": 400,
    "mydemo.pipelines.PixivDownloadPipeline": 500,
}
最终运行:
scrapy crawl pixiv_download
实践3:使用自定义ImagePipeline下载图片
Scrapy框架提供了ImagesPipeline模块来进行下载图片,但要实现重命名,分文件夹等操作,就需要进行一些自定义了。具体实现步骤如下:
-
首先,Item和上一个例子一样,这里就不做改变了,还是使用如下item
class PixivDownloadItem(scrapy.Item):
    """Fields handed from the spider to the download pipeline."""
    folder_name = scrapy.Field()  # folder name for multi-page artworks
    is_many = scrapy.Field()      # whether the artwork has multiple pages
    headers = scrapy.Field()      # headers to reuse when fetching the images
    final_urls = scrapy.Field()   # list of per-page download descriptors
-
Spider程序也和上面基本相同,不同的只是改了个名字,并指定了新的Pipeline
class PixivNewSpider(scrapy.Spider):
    """Same crawl as PixivDownloadSpider, but items are routed to the
    ImagesPipeline-based pipeline instead."""

    name = "pixiv_new"
    allowed_domains = ["pixiv.net"]
    url_list = list()
    custom_settings = {
        'ITEM_PIPELINES': {
            # Route items to the custom ImagesPipeline subclass.
            'mydemo.pipelines.PixivImagePipeline': 600,
        }
    }
-
重点是定义了一个继承自ImagesPipeline的类PixivImagePipeline来对Item进行处理
class PixivImagePipeline(ImagesPipeline):
    """ImagesPipeline subclass that renames files, sorts multi-page artworks
    into folders, and records per-image success/failure in an Excel workbook."""

    root_path = os.path.abspath(os.path.dirname(__file__))

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # One sheet for successful downloads, one for failures.
        self.workbook = openpyxl.Workbook()
        self.ws_success = self.workbook.active
        self.ws_success.title = "成功记录"
        self.ws_success.append(('文件夹', '文件名', '下载状态', '下载链接'))
        self.ws_fail = self.workbook.create_sheet("失败记录")
        self.ws_fail.append(('文件夹', '文件名', '下载状态', '下载链接'))

    def get_media_requests(self, item: PixivDownloadItem, info):
        """Emit one download Request per image URL in the item, carrying the
        target file name and directory in the request meta."""
        for img in item["final_urls"]:
            if item["is_many"]:
                target_dir = f"{PixivImagePipeline.root_path}/output/{item['folder_name']}"
            else:
                target_dir = f"{PixivImagePipeline.root_path}/output"
            yield Request(
                img["url"],
                headers=item["headers"],
                meta={
                    "file_name": f"{img['title']}.{img['file_type']}",
                    "save_path": target_dir,
                },
            )

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the full save path, e.g. static/output/aaa.jpg."""
        target_dir = request.meta["save_path"]
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        # Build the image save path and file name.
        return f"{target_dir}/{request.meta['file_name']}"

    def item_completed(self, results, item, info):
        """Log each finished download.

        Called once per item after all of its image requests finish; ``results``
        holds one (ok, info) tuple per download link of that item.
        """
        for is_ok, result in results:
            if is_ok:
                for img in item["final_urls"]:
                    if result["url"] == img["url"]:
                        self.ws_success.append(
                            [item["folder_name"], img["title"], "下载成功", result["url"]])
            else:
                # NOTE(review): on a failure every URL of the item is recorded
                # as failed (the failure object carries no URL to match on),
                # and the whole item is dropped immediately — even if some of
                # its other pages downloaded fine.
                for img in item["final_urls"]:
                    self.ws_fail.append(
                        [item["folder_name"], img["title"], "下载失败", img["url"]])
                raise DropItem('Image Downloaded Failed')
        return item

    def close_spider(self, spider):
        """Persist the success/failure log next to the pipeline module."""
        base_dir = os.path.abspath(os.path.dirname(__file__))
        self.workbook.save(f'{base_dir}/output/pixiv_weekly_下载情况.xlsx')
-
最后别忘了在settings.py里添加管道
# All project pipelines, including the new ImagesPipeline subclass.
ITEM_PIPELINES = {
    "mydemo.pipelines.DoubanItemPipeline": 300,
    "mydemo.pipelines.PixivPipeline": 400,
    "mydemo.pipelines.PixivDownloadPipeline": 500,
    "mydemo.pipelines.PixivImagePipeline": 600,
}
最终运行命令:
scrapy crawl pixiv_new