因为需要用到scrapy图片爬取的中间件(ImagesPipeline),故应先安装Pillow(PIL的维护版本):
pip install pillow
settings.py:
# Enable Scrapy's built-in image pipeline (priority 1) and configure it.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
IMAGES_URLS_FIELD = 'url' # item field that holds the LIST of image URLs
IMAGES_STORE = r'.' # directory where downloaded images are saved (full/ subdir)
items.py:
import scrapy
class SinaTripItem(scrapy.Item):
    """Item carrying the image URLs (and their names) scraped from the page.

    `url` must be a *list* of absolute URLs — ImagesPipeline iterates the
    field, and a plain string would be iterated character by character
    (the classic "Missing scheme in request url: h" error).
    """

    url = scrapy.Field()       # list of absolute image URLs
    # Declared because MyImagesPipeline.file_path() reads item['img_name'];
    # without this Field, assigning/reading it raises KeyError. The spider
    # must populate it with one name per URL.
    img_name = scrapy.Field()
主爬虫文件:
import scrapy
from scrapy.spiders import Spider
from scrapy.selector import Selector
from sina_trip.items import SinaTripItem
class sinaTripSpider(Spider):
    """Crawl the Sina travel front page and collect absolute image URLs."""

    name = "sinaTripSpider"                      # name used by `scrapy crawl`
    start_urls = ["http://travel.sina.com.cn/"]  # initial request

    def parse(self, response):
        """Extract every <img src> attribute and yield one item with them all.

        Yields a single SinaTripItem whose 'url' field is a list — the
        ImagesPipeline requires a list, not a string.
        """
        item = SinaTripItem()
        sel = Selector(response)
        sites = sel.xpath("//img/@src").extract()  # raw src attributes
        item['url'] = []
        for site in sites:
            # Only scheme-less links (protocol-relative "//host/…" etc.) need
            # a scheme prefixed. The original `'http:' not in site` test also
            # mangled https:// URLs (it prefixed them with 'http:'), so check
            # the scheme explicitly instead of substring membership.
            if not site.startswith(('http:', 'https:')):
                site = 'http:' + site
            item['url'].append(site)
        yield item
过程中所遇问题:
ValueError: Missing scheme in request url: h
解决办法:IMAGES_URLS_FIELD 指向的字段必须是一个 List。如果把单个 URL 存成字符串,Scrapy 会对字符串逐字符迭代,把首字符 'h' 当作一个 URL 去请求,于是报出上面的错误;只需将该字段存为 list 即可。
运用Pipelines做后期处理(数据清洗、验证、过滤等):
# -*- coding: utf-8 -*-
from scrapy.pipelines.images import ImagesPipeline
from scrapy.http import Request
class DoubanmoviePipeline(object):
    """Pass-through pipeline: every item is forwarded unchanged."""

    def process_item(self, item, spider):
        # No cleaning, validation or filtering is performed here —
        # the item is simply handed on to the next pipeline stage.
        return item
class MyImagesPipeline(ImagesPipeline):
    """ImagesPipeline subclass that renames each stored image.

    The position of every URL is forwarded in request.meta so that
    file_path() can look up the matching name for that image.
    """

    def get_media_requests(self, item, info):
        # enumerate() yields the true position of each URL. The original
        # item['url'].index(url) returned the FIRST occurrence, so duplicate
        # URLs all mapped to the same index (wrong filenames), and it cost
        # an O(n) scan per URL on top of that.
        for index, url in enumerate(item['url']):
            yield Request(url, meta={'item': item, 'index': index})

    def file_path(self, request, response=None, info=None):
        """Return the relative storage path for one downloaded image."""
        item = request.meta['item']
        index = request.meta['index']
        # NOTE(review): assumes item['img_name'] holds one name per URL;
        # the SinaTripItem shown above only declares 'url', so confirm the
        # spider populates this field — otherwise this raises KeyError.
        image_name = item['img_name'][index]
        return 'full/%s.jpg' % image_name