版权声明:原创文章,欢迎一起学习交流!
使用scrapy的ImagesPipeline爬取图片的时候,运行报错
Traceback (most recent call last): File "/home/lcy/.local/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks current.result = callback(current.result, *args, **kw) File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/pipelines/media.py", line 62, in process_item requests = arg_to_iter(self.get_media_requests(item, info)) File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/pipelines/images.py", line 147, in get_media_requests return [Request(x) for x in item.get(self.images_urls_field, [])] File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 25, in __init__ self._set_url(url) File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 57, in _set_url raise ValueError('Missing scheme in request url: %s' % self._url) ValueError: Missing scheme in request url: h
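查找了相关的文档,了解到使用ImagesPipeline时,item中存放图片地址的字段(即IMAGES_URLS_FIELD指定的字段)必须是一个由url组成的list。pipeline会遍历这个字段,并为其中的每个元素创建一个Request;而我写的是一个字符串,遍历字符串得到的是一个个单独的字符,第一个字符'h'被当作url传给了Request,所以报错"Missing scheme in request url: h"。另外,在传入一个list的时候pipeline处理的速度要快得多。所以我们只需要把传入的url改成list格式就行了。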
源码附上:
修改前:
# -*- coding: utf-8 -*-
import scrapy
from imgspider.items import QiubaiPicItem
import sys

# Python 2 only: reload() restores sys.setdefaultencoding so that UTF-8
# becomes the implicit str/unicode codec for this process.
reload(sys)
sys.setdefaultencoding("utf-8")


class QiubaipicSpider(scrapy.Spider):
    """Collect thumbnail URLs from a qiushibaike.com picture page.

    This is the failing variant: each item stores its image URL as a plain
    string in the ``img`` field.
    """

    name = "qiubaiPic"
    allowed_domains = ["qiushibaike.com"]
    start_urls = ['http://qiushibaike.com/']

    def parse(self, response):
        """Request the hard-coded picture page 3 and hand it to parse_detail."""
        # Pagination is switched off; the disabled loop is kept for reference:
        # page_value=response.xpath('//*[@id="content-left"]/ul/li[8]/a/span/text()').extract()[0]
        # for page in range(1,int(page_value)):
        #     url='http://www.qiushibaike.com/pic/page/'+str(page)
        #     yield scrapy.Request(url,callback=self.parse_detail)
        page_url = 'http://www.qiushibaike.com/pic/page/3'
        yield scrapy.Request(page_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Return a list with one QiubaiPicItem per article block on the page."""
        collected = []
        articles = response.xpath(
            '//*[@id="content-left"]/div[@class="article block untagged mb15"]'
        )
        for article in articles:
            pic = QiubaiPicItem()
            thumb_src = article.xpath('div[@class="thumb"]/a/img/@src').extract()[0]
            # Drop the first two characters of the src and prefix 'http://'.
            # NOTE: the value stored here is a single string. ImagesPipeline
            # builds Request(x) for every x in this field, so a string is
            # walked character by character and the first request is built
            # from 'h' -> "ValueError: Missing scheme in request url: h".
            pic['img'] = 'http://' + thumb_src[2:]
            collected.append(pic)
        return collected
修改后:
# -*- coding: utf-8 -*-
import scrapy
from imgspider.items import QiubaiPicItem
import sys

# NOTE(review): Python 2-only hack (reload() is not a builtin on Python 3).
# It is kept so that implicit str/unicode conversions keep behaving as
# before; confirm whether anything still depends on it before removing.
reload(sys)
sys.setdefaultencoding("utf-8")


class QiubaipicSpider(scrapy.Spider):
    """Collect thumbnail URLs from a qiushibaike.com picture page.

    Every yielded item stores its image URL as a one-element list in the
    ``img`` field, because ImagesPipeline iterates that field and creates
    one Request per element.
    """

    name = "qiubaiPic"
    allowed_domains = ["qiushibaike.com"]
    start_urls = ['http://qiushibaike.com/']

    def parse(self, response):
        """Request the hard-coded picture page 3 and hand it to parse_detail."""
        # Pagination is switched off; the disabled loop is kept for reference:
        # page_value=response.xpath('//*[@id="content-left"]/ul/li[8]/a/span/text()').extract()[0]
        # for page in range(1,int(page_value)):
        #     url='http://www.qiushibaike.com/pic/page/'+str(page)
        #     yield scrapy.Request(url,callback=self.parse_detail)
        page_url = 'http://www.qiushibaike.com/pic/page/3'
        yield scrapy.Request(page_url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Yield one QiubaiPicItem per article block that has a thumbnail.

        Each item gets its own single-element URL list. Sharing one growing
        list between all items would make every item carry every URL found
        on the page.
        """
        articles = response.xpath(
            '//*[@id="content-left"]/div[@class="article block untagged mb15"]'
        )
        for article in articles:
            srcs = article.xpath('div[@class="thumb"]/a/img/@src').extract()
            if not srcs:
                # Article without a thumbnail: nothing to download.
                continue
            pic = QiubaiPicItem()
            # urljoin resolves protocol-relative ('//host/path'), relative
            # and absolute srcs against the page URL, instead of blindly
            # cutting two characters and prefixing 'http://'.
            pic['img'] = [response.urljoin(srcs[0])]
            yield pic
settings.py文件
# -*- coding: utf-8 -*-
"""Scrapy settings for the imgspider project."""
import random

BOT_NAME = 'imgspider'

SPIDER_MODULES = ['imgspider.spiders']
NEWSPIDER_MODULE = 'imgspider.spiders'

# Pool of browser User-Agent strings (the original author notes that a
# browser-like User-Agent is required). Entries are unique, and every entry
# is followed by a comma so adjacent literals are never fused into one value.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# One User-Agent is chosen when this module is imported and is then used for
# the whole run. The list is non-empty, so no fallback value is needed.
ua = random.choice(USER_AGENT_LIST)
USER_AGENT = ua
# Parenthesised single-argument form prints the same on Python 2 and 3.
print(ua)

# Whether to obey robots.txt.
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests (this is not a thread count).
CONCURRENT_REQUESTS = 32

# Delay between downloads, in seconds.
DOWNLOAD_DELAY = 3

# Cookie handling; the original author recommends keeping it disabled.
COOKIES_ENABLED = False

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Single definition with the correctly spelled pipeline class path.
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}

# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = 'img'
# Directory where downloaded images are stored.
IMAGES_STORE = r'/home/lcy/pics'

LOG_FILE = "scrapy.log"