import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import DoubanMovieItem


class CrawlImageSpider(CrawlSpider):
    name = "crawl_image"
    allowed_domains = ["movie.douban.com"]
    start_urls = ["https://movie.douban.com/top250"]

    rules = (
        # Each movie detail page is handed to parse_item; the pattern is a raw
        # string with the dots escaped so it only matches real subject URLs
        Rule(LinkExtractor(allow=r'^https://movie\.douban\.com/subject/\d+/$'),
             callback="parse_item", follow=False),
        # Follow the "next page" link to paginate through the Top 250 list
        Rule(LinkExtractor(restrict_xpaths='//span[@class="next"]/a'), follow=True),
    )

    def parse_item(self, response):
        # Instantiate the item object
        items = DoubanMovieItem()
        # Extract the movie's poster URL; it reaches the images pipeline via the item
        image_url = response.xpath('//div[@id="mainpic"]/a/img/@src').get()
        # Extract the movie's title
        image_name = response.xpath('//span[@property="v:itemreviewed"]/text()').get()
        # An ASCII colon in the title is illegal in Windows file names and breaks
        # the image download, so swap it for a full-width colon
        image_name = image_name.replace(':', '：')
        # Wrap the URL in a list, since the image_urls field must hold a list of links
        items['image_urls'] = [image_url]
        # Pass the extracted title to the item
        items['image_name'] = image_name
        print(f'Extracted movie: 《{image_name}》 - {image_url}')
        # Return the item so the poster is routed through items.py to the images pipeline
        return items
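
# The spider above imports DoubanMovieItem from ..items. A minimal sketch of
# the items.py it assumes, limited to the three fields the code actually uses
# (image_urls and images are the field names ImagesPipeline looks for):
import scrapy


class DoubanMovieItem(scrapy.Item):
    # List of poster URLs consumed by the images pipeline
    image_urls = scrapy.Field()
    # Filled in by ImagesPipeline with the download results
    images = scrapy.Field()
    # Movie title, used by file_path to name the saved poster
    image_name = scrapy.Field()
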
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
# useful for handling different item types with a single interface
from itemadapter import ItemAdapter
import pymongo


class DoubanMoviePipeline:
    # Set up the MongoDB connection when the pipeline is created
    def __init__(self):
        # Connect to the python_test database and select the movie_top collection
        self.client = pymongo.MongoClient()
        self.db = self.client['python_test']
        self.collection = self.db['movie_top']

    def process_item(self, item, spider):
        print(item)
        # A Scrapy Item is not a dict, so convert it with ItemAdapter
        # before inserting it into MongoDB
        self.collection.insert_one(ItemAdapter(item).asdict())
        return item
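
    # Not in the original code: a hedged addition that closes the MongoDB
    # client when the crawl finishes, so the connection is not left open.
    def close_spider(self, spider):
        self.client.close()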


from scrapy import Request
from itemadapter import ItemAdapter
from scrapy.pipelines.images import ImagesPipeline


# Custom images pipeline
class DoubanImagePipeline(ImagesPipeline):
    # Override get_media_requests to issue the image download requests
    def get_media_requests(self, item, info):
        # Read the image_urls field of the item through ItemAdapter
        urls = ItemAdapter(item).get(self.images_urls_field, [])
        # Request every link in image_urls, carrying the item in meta
        # so that file_path can read the movie title later
        return [Request(u, meta={'image': item}) for u in urls]

    # Override file_path to control where the image is saved and how it is named
    def file_path(self, request, response=None, info=None, *, item=None):
        # Recover the item attached to the request in get_media_requests
        item = request.meta.get('image')
        # Use the movie title as the file name
        image_name = item['image_name']
        # Save under an images/ folder (relative to IMAGES_STORE), named after the movie
        return f"images/{image_name}.jpg"