Scrapy file downloads
I. Image downloads
1. Configure settings
ITEM_PIPELINES = {
    # You can use the framework's built-in ImagesPipeline to download images:
    # 'scrapy.pipelines.images.ImagesPipeline': 300,
    'baiduimg.pipelines.ImagesrenamePipeline': 300,
}
# Set the storage path
IMAGES_STORE = 'freebuf'
# With this setting, three images are saved per URL: the original
# plus two copies at the specified sizes
IMAGES_THUMBS = {
    'big': (270, 270),
    'small': (100, 100),
}
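The spider in the next step imports BaiduimgItem, which the original never shows. A minimal sketch of what items.py might contain (image_urls and images are the field names ImagesPipeline expects; imgname is an extra field assumed here for the renaming pipeline in step 3):

import scrapy

class BaiduimgItem(scrapy.Item):
    image_urls = scrapy.Field()  # list of image URLs consumed by ImagesPipeline
    images = scrapy.Field()      # populated with the download results
    imgname = scrapy.Field()     # folder name used by the renaming pipeline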
2. Write the spider
import scrapy
import json
from ..items import BaiduimgItem

class ImgSpider(scrapy.Spider):
    name = 'baiduimg'
    # allowed_domains = ['']
    next_urls = ['http://image.so.com/zjl?ch=pet&sn=%s&listtype=new&temp=1']
    start_urls = ['http://image.so.com/zjl?ch=pet&sn=%s&listtype=new&temp=1' % 30]

    def parse(self, response):
        item = BaiduimgItem()
        json_data = json.loads(response.text)
        # The value of 'image_urls' must be a list; otherwise you have to
        # override the ImagesPipeline methods:
        # for x in json_data['list']:
        #     # print(x['qhimg_url'])
        #     item['image_urls'] = x['qhimg_url']
        #     yield item
        item['image_urls'] = [x['qhimg_url'] for x in json_data['list']]
        # Folder name consumed by the renaming pipeline in step 3
        # (assumed here: taken from the 'ch' query parameter of the API)
        item['imgname'] = 'pet'
        yield item
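The spider defines next_urls but never uses it. A hedged sketch of a parse() that also follows the next page, assuming sn is a numeric offset the API accepts in steps of 30 and that an empty list marks the last page:

    def parse(self, response):
        item = BaiduimgItem()
        json_data = json.loads(response.text)
        item['image_urls'] = [x['qhimg_url'] for x in json_data['list']]
        item['imgname'] = 'pet'
        yield item
        # follow the next page while the API keeps returning results
        if json_data['list']:
            sn = int(response.url.split('sn=')[-1].split('&')[0]) + 30
            yield scrapy.Request(self.next_urls[0] % sn, callback=self.parse)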
3. Override ImagesPipeline
If you do not override it, the default ImagesPipeline methods are used to download and name the images.
import re
from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request

class ImagesrenamePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # Issue one download request per image URL; if a single URL rather
        # than a list were passed in, you could yield it directly without the loop
        for image_url in item['image_urls']:
            # meta carries data from the spider down to file_path() below
            yield Request(image_url, meta={'name': item['imgname']})

    # Rename the files: without overriding this method, each image is named
    # by the hash of its URL, i.e. a meaningless jumble of characters
    def file_path(self, request, response=None, info=None):
        # Use the last URL segment as the image file name
        image_guid = request.url.split('/')[-1]
        # Receive the name passed through meta above
        name = request.meta['name']
        # Strip characters Windows forbids in file names; skipping this step
        # leads to garbled names or failed downloads
        name = re.sub(r'[?\\*|"<>:/]', '', name)
        # Key to per-folder storage: {0} is the folder name, {1} the file name
        filename = '{0}/{1}'.format(name, image_guid)
        return filename
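With the settings above, a sketch of the resulting layout under IMAGES_STORE = 'freebuf' (<name> comes from the meta value, <image_guid> from the last URL segment; thumbnail names are produced by the separate thumb_path() method, which this override leaves at its default):

freebuf/
    <name>/<image_guid>       # renamed full-size image
    thumbs/big/<hash>.jpg     # IMAGES_THUMBS output, default naming
    thumbs/small/<hash>.jpg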
II. File downloads
1. Enable the pipeline in settings.py
ITEM_PIPELINES = {
    'scrapy.pipelines.files.FilesPipeline': 1,
}
FILES_STORE = r'E:\scrapy_project\file_download\file'
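Note that this registers the stock FilesPipeline; if you use the custom FileDownloadPipeline from step 3 below, register that class instead (the module path here is an assumption based on the project name):

ITEM_PIPELINES = {
    'file_download.pipelines.FileDownloadPipeline': 1,
}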
2. Define file_urls and files in items.py
import scrapy

class FileDownloadItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    file_urls = scrapy.Field()  # list of file URLs consumed by FilesPipeline
    files = scrapy.Field()      # populated with the download results
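The original does not show the spider for this part. A minimal sketch, assuming a page from which you collect downloadable links (the start URL and the .py filter are placeholders):

import scrapy
from ..items import FileDownloadItem

class FileSpider(scrapy.Spider):
    name = 'file_download'
    start_urls = ['https://example.com/downloads']  # placeholder URL

    def parse(self, response):
        item = FileDownloadItem()
        # hypothetical selector: collect every link ending in .py
        item['file_urls'] = [
            response.urljoin(href)
            for href in response.css('a::attr(href)').getall()
            if href.endswith('.py')
        ]
        yield item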
3. Override the pipeline in pipelines.py
If you do not override FilesPipeline, each downloaded file is named with a hash string.
from scrapy.pipelines.files import FilesPipeline
from urllib.parse import urlparse
from os.path import basename, dirname, join

class FileDownloadPipeline(FilesPipeline):

    def file_path(self, request, response=None, info=None):
        path = urlparse(request.url).path
        # urlparse() splits a URL into its parts, e.g.:
        '''
        ParseResult(scheme='https',          # protocol
                    netloc='mbd.baidu.com',  # host
                    path='/newspage/data/landingsuper/animate_decay.py',  # file path
                    params='',               # parameters
                    query='',                # query string
                    fragment='')             # fragment (anchor)
        '''
        # keep the parent directory name plus the file name
        return join(basename(dirname(path)), basename(path))
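For the URL in the docstring above, https://mbd.baidu.com/newspage/data/landingsuper/animate_decay.py, file_path() returns landingsuper/animate_decay.py (with the OS-specific separator), so the file keeps its real name inside a landingsuper subfolder of FILES_STORE instead of being saved as an opaque hash.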