MySQL database
In settings.py:
# MySQL settings; the pipeline's from_crawler also reads MYSQL_USER and
# MYSQL_PASSWORD, so they must be defined here (values are placeholders --
# use your own credentials)
MYSQL_HOST = 'localhost'
MYSQL_DATABASE = 'images360'
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_PORT = 3306
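The pipeline below assumes the database and table already exist. Here is a minimal one-off setup sketch; the table name and column names (id, url, title, thumb) are assumptions matching the Item sketch shown later, not something fixed by the pipeline itself:

import pymysql

# One-off setup: create the database and a table whose columns match the
# item's fields, since the pipeline builds its INSERT from those fields
db = pymysql.connect(host='localhost', user='root', password='', port=3306)
cursor = db.cursor()
cursor.execute('CREATE DATABASE IF NOT EXISTS images360 DEFAULT CHARACTER SET utf8')
cursor.execute(
    'CREATE TABLE IF NOT EXISTS images360.images ('
    'id VARCHAR(255) NOT NULL PRIMARY KEY, '
    'url VARCHAR(255), title VARCHAR(255), thumb VARCHAR(255))'
)
db.close()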
In pipelines.py:
import pymysql


class MysqlPipeline(object):
    def __init__(self, host, database, user, password, port):
        self.host = host
        self.database = database
        self.user = user
        self.password = password
        self.port = port

    @classmethod
    def from_crawler(cls, crawler):
        # Read the connection parameters from settings.py and build the pipeline
        return cls(
            host=crawler.settings.get('MYSQL_HOST'),
            database=crawler.settings.get('MYSQL_DATABASE'),
            user=crawler.settings.get('MYSQL_USER'),
            password=crawler.settings.get('MYSQL_PASSWORD'),
            port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        # Open the connection once when the spider starts
        self.db = pymysql.connect(host=self.host, user=self.user,
                                  password=self.password, database=self.database,
                                  charset='utf8', port=self.port)
        self.cursor = self.db.cursor()

    def process_item(self, item, spider):
        # Build a parameterized INSERT from the item's fields; item.table is a
        # class attribute on the Item (see the sketch after this class)
        data = dict(item)
        keys = ', '.join(data.keys())
        values = ', '.join(['%s'] * len(data))
        sql = 'insert into %s (%s) values (%s)' % (item.table, keys, values)
        self.cursor.execute(sql, tuple(data.values()))
        self.db.commit()
        return item

    def close_spider(self, spider):
        self.db.close()
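The SQL above reads item.table, and the Mongo pipeline below reads item.collection; both are class attributes defined on the Item. A minimal items.py sketch (the field names and the shared 'images' name are assumptions for illustration):

from scrapy import Item, Field


class ImageItem(Item):
    # Class attributes read by the pipelines: the MySQL table name and the
    # MongoDB collection name (both assumed to be 'images' here)
    collection = table = 'images'
    id = Field()
    url = Field()
    title = Field()
    thumb = Field()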
MongoDB database
In settings.py:
# MongoDB settings
MONGO_URL = 'localhost'
MONGO_DB = 'images360'
In pipelines.py:
import pymongo


class MongoPipeline(object):
    def __init__(self, mongo_url, mongo_db):
        self.mongo_url = mongo_url
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # The crawler object exposes Scrapy's core components (such as the
        # global settings); use it to create a pipeline instance
        return cls(
            mongo_url=crawler.settings.get('MONGO_URL'),
            mongo_db=crawler.settings.get('MONGO_DB')
        )

    def open_spider(self, spider):
        # Create the client connection
        self.client = pymongo.MongoClient(self.mongo_url)
        # Select the database
        self.db = self.client[self.mongo_db]

    def process_item(self, item, spider):
        # Insert the item into the collection named by item.collection
        # (insert_one replaces the deprecated insert)
        self.db[item.collection].insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the client connection
        self.client.close()
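To confirm that items are landing in MongoDB, a quick check from a Python shell; the database name matches the settings above, and 'images' is the collection name assumed in the Item sketch:

import pymongo

client = pymongo.MongoClient('localhost')
db = client['images360']
print(db['images'].count_documents({}))  # how many items were stored
print(db['images'].find_one())           # peek at one stored document
client.close()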
Downloading with the built-in ImagesPipeline
In settings.py:
# Directory where downloaded images are saved
IMAGES_STORE = './images'

# Lower priority numbers run first
ITEM_PIPELINES = {
    'images360.pipelines.MongoPipeline': 300,
    'images360.pipelines.MysqlPipeline': 301,
    'images360.pipelines.ImagePipeline': 302,
}
In pipelines.py:
from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline


class ImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        # Name the file after the last segment of its URL instead of the
        # default full/<sha1-hash>.jpg layout; paths are relative to IMAGES_STORE
        url = request.url
        file_name = url.split('/')[-1]
        return file_name

    def item_completed(self, results, item, info):
        # results is a list of (success, info_or_failure) tuples;
        # drop the item if no image was downloaded successfully
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem('Image Downloaded Failed')
        return item

    def get_media_requests(self, item, info):
        # Schedule a download request for the item's image URL
        yield Request(item['url'])
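For completeness, a hypothetical spider sketch showing where item['url'] comes from before the pipelines run; the spider name, start URL, and JSON field names are all assumptions for illustration, not part of the original:

import json

from scrapy import Spider
from images360.items import ImageItem


class ImagesSpider(Spider):
    name = 'images'
    # Hypothetical JSON API endpoint; replace with the real image-list URL
    start_urls = ['https://image.so.com/zjl?ch=photography&sn=0']

    def parse(self, response):
        result = json.loads(response.text)
        for entry in result.get('list', []):
            item = ImageItem()
            item['id'] = entry.get('id')            # assumed response fields
            item['url'] = entry.get('qhimg_url')
            item['title'] = entry.get('title')
            item['thumb'] = entry.get('qhimg_thumb')
            yield item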