spider 部分最简单,代码如下:
# -*- coding: utf-8 -*-
import scrapy,json
from urllib import parse
from Image360.items import Image360Item
class ImagezzSpider(scrapy.Spider):
    """Crawl the 360 image JSON API and emit one item per photo entry."""

    name = 'Imagezz'
    allowed_domains = ['image.so.com']

    def start_requests(self):
        # Build one API request per page; ``sn`` is the result offset
        # (30 entries per page on this endpoint).
        endpoint = 'http://image.so.com/zj?'
        params = {'ch': 'photography', 'listtype': 'new'}
        for page_no in range(1, 2):
            params['sn'] = page_no * 30
            yield scrapy.Request(endpoint + parse.urlencode(params),
                                 callback=self.parse)

    def parse(self, response):
        # The endpoint answers with JSON; photo entries live under 'list'.
        payload = json.loads(response.text)
        for entry in payload.get('list'):
            item = Image360Item()
            item['id'] = entry.get('id')
            item['image_urls'] = entry.get('qhimg_url')
            item['title'] = entry.get('group_title')
            item['thumb'] = entry.get('qhimg_thumb_url')
            yield item
重点在 pipelines:自定义了图片下载和 MongoDB 存储两个 pipeline。
import pymongo
class MongoPipeline(object):
    """Persist scraped items into MongoDB.

    Connection parameters (MONGO_URI / MONGO_DB) are read from the
    project settings via ``from_crawler``.
    """

    def __init__(self, mongo_uri, mongo_db):
        # MongoDB host URI and target database name.
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Alternate constructor: lets us read settings before instantiation.
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Open the client once per spider run.
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # FIX: ``insert_one`` replaces ``Collection.insert``, which is
        # deprecated and was removed in PyMongo 4.x.
        # item.collection names the target collection -- assumes the
        # Item class defines that attribute; TODO confirm.
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Release the connection when the spider finishes.
        self.client.close()
from scrapy.http import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class ImPipeline(ImagesPipeline):
    """Download the image referenced by each item's ``image_urls`` field.

    NOTE(review): the spider stores a single URL string (not a list) in
    ``image_urls``, so this pipeline requests it directly instead of
    relying on the default list-based behaviour.
    """

    def get_media_requests(self, item, info):
        # One download request per item.
        yield Request(item['image_urls'])

    def file_path(self, request, response=None, info=None):
        # Name the stored file after the last path segment of its URL.
        url = request.url
        file_name = url.split('/')[-1]
        print(file_name)
        return file_name

    def item_completed(self, results, item, info):
        # BUG FIX: the original ended with ``yield item``, turning this
        # method into a generator -- Scrapy expects the item to be
        # *returned*, so downstream pipelines (e.g. MongoPipeline) would
        # have received a generator object instead of the item.
        # Also fixed the "Faild" typo in the error message.
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Download Failed')
        return item
settings设置
# Ignore robots.txt so the JSON API endpoint can be crawled.
ROBOTSTXT_OBEY = False

# Enable the image-download pipeline and the MongoDB pipeline;
# lower priority numbers run first.
ITEM_PIPELINES = {
    'Image360.pipelines.ImPipeline': 300,
    'Image360.pipelines.MongoPipeline': 301,
}

# Local directory where downloaded images are stored.
IMAGES_STORE = r'E:\Scrapy'

# MongoDB host and database name consumed by MongoPipeline.
MONGO_URI = 'localhost'
MONGO_DB = 'images360'
注意 IMAGES_STORE 一定不能写错:如果写错,日志中 `INFO: Enabled item pipelines:` 一行不会列出编写的 ImPipeline(即该 pipeline 未启用),而且不会报任何错误,排查起来很费时间,我就找了半天!