# pipelines.py
import json
from scrapy.conf import settings
from pymongo import MongoClient
class SunPipeline(object):
    """Item pipeline that appends each scraped item to dongguan.json,
    one JSON object per line (with a trailing comma, as in the original
    output format)."""

    def open_spider(self, spider):
        # Explicit utf-8 is required: items are serialized with
        # ensure_ascii=False below, so non-ASCII text is written verbatim
        # and the platform default encoding (e.g. cp936 on Windows) would
        # raise UnicodeEncodeError.
        self.file = open('dongguan.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        # Serialize the item; ensure_ascii=False keeps Chinese characters
        # readable instead of \uXXXX escapes.
        str_data = json.dumps(dict(item), ensure_ascii=False) + ',\n'
        self.file.write(str_data)
        # Return the item so lower-priority pipelines (MongoPipeline) run too.
        return item

    def close_spider(self, spider):
        self.file.close()
class MongoPipeline(object):
    """Item pipeline that stores every scraped item in the MongoDB
    collection configured via MONGO_* keys in settings.py."""

    def __init__(self):
        # Read the connection parameters from the project settings.
        # NOTE(review): `from scrapy.conf import settings` is deprecated and
        # removed in modern Scrapy; the supported pattern is a
        # `from_crawler(cls, crawler)` classmethod reading crawler.settings —
        # confirm the Scrapy version before migrating.
        host = settings['MONGO_HOST']
        port = settings['MONGO_PORT']
        dbname = settings['MONGO_DBNAME']
        colname = settings['MONGO_COLNAME']
        # Connect and select database / collection.
        self.client = MongoClient(host, port)
        self.db = self.client[dbname]
        self.col = self.db[colname]

    def process_item(self, item, spider):
        # insert_one() replaces Collection.insert(), which was deprecated in
        # pymongo 3.x and removed entirely in pymongo 4.x.
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        # Close the connection in Scrapy's close_spider hook (consistent with
        # SunPipeline). The previous __del__ approach is unreliable: __del__
        # is not guaranteed to run at interpreter shutdown.
        self.client.close()
# settings.py
# Scrapy project identity and spider discovery.
BOT_NAME = 'Sun'
SPIDER_MODULES = ['Sun.spiders']
NEWSPIDER_MODULE = 'Sun.spiders'

# MongoDB connection parameters read by MongoPipeline in pipelines.py.
MONGO_HOST = '127.0.0.1'
MONGO_PORT = 27017
MONGO_DBNAME = 'Sun'
MONGO_COLNAME = 'dongguan'

# Enabled item pipelines; the lower number (SunPipeline, 300) runs first,
# then MongoPipeline (301) receives the item it returns.
ITEM_PIPELINES = {
    'Sun.pipelines.SunPipeline': 300,
    'Sun.pipelines.MongoPipeline': 301,
}