我今天在练习使用 Scrapy 的 ImagesPipeline 下载图片时,遇到了下面的报错:
Traceback (most recent call last):
········
File "e:\anaconda\lib\site-packages\scrapy\http\request\__init__.py", line 62, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
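查阅资料后得知:ImagesPipeline 会遍历 item 的 image_urls 字段,并为其中的每个元素发起一次下载请求,因此 image_urls 必须是一个 list。我这里写的是一个字符串,pipeline 就把它逐字符遍历,第一个“URL”变成了 h(即 http 的首字母),所以才会报 Missing scheme in request url: h。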
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from ..items import MeiziItem
class A2Spider(scrapy.Spider):
    """Crawl tupianzj.com galleries and emit image URLs for the images pipeline.

    ``parse`` requests every gallery linked from the listing page;
    ``parse_tu`` emits one item per gallery page and then follows the
    pagination link to the next page of the same gallery.
    """

    # Number of image items emitted so far; compared with MAX_DOWNLOAD_NUM.
    index = 0
    # Upper bound on the number of image items this spider emits.
    MAX_DOWNLOAD_NUM = 1000
    name = 'a2'
    allowed_domains = ['www.tupianzj.com']
    start_urls = ['http://www.tupianzj.com/meinv/mm/meizitu/']

    def parse(self, response):
        """Yield a request for each gallery link found on the listing page."""
        le = LinkExtractor(restrict_xpaths='//ul[@class="d1 ico3"]/li/a')
        # The extractor reads the target URL straight from the matched <a> tags.
        for link in le.extract_links(response):
            yield scrapy.Request(link.url, callback=self.parse_tu)

    def parse_tu(self, response):
        """Yield the image item for one gallery page, then follow pagination.

        The yielded item is a dict with ``image_urls`` (a one-element list)
        and ``name`` (the page title, or None when the title is missing).
        """
        # Once the cap is reached there is nothing left to emit, so stop
        # here instead of crawling further pages for no result.
        if self.index >= self.MAX_DOWNLOAD_NUM:
            return

        title = response.xpath(
            '//*[@id="container"]/div/div/div[2]/h1/text()'
        ).extract_first()
        src = response.xpath('//*[@id="bigpicimg"]/@src').extract_first()

        if src:
            self.index += 1
            yield {
                'name': title,
                # image_urls must be a list: the images pipeline iterates it,
                # so a bare string would be requested one character at a time
                # ("Missing scheme in request url: h"). urljoin makes a
                # relative or protocol-relative src absolute.
                'image_urls': [response.urljoin(src)],
            }

        # The last <li> of the pager links to the next page of this gallery.
        le2 = LinkExtractor(restrict_xpaths='//div[@class="pages"]//li[last()]')
        links = le2.extract_links(response)
        if links:
            yield scrapy.Request(links[0].url, callback=self.parse_tu)