mysql:
首先配置文件:
ITEM_PIPELINES = {'firstbloodpro.pipelines.MysqlproPipeline': 300},配置好管道
第二配置好所需要的用户名等
HOST='localhost'
PORT=3306
USER='root'
PWD='123456'
DB='lala'
CHARSET = 'utf8'
管道中:
from scrapy.utils.project import get_project_settings
import pymysql
class MysqlproPipeline(object):
    """Scrapy pipeline that stores scraped items in a MySQL database via pymysql."""

    def open_spider(self, spider):
        # settings 就是一个字典, 字典的键就是所有的配置选项
        # (settings behaves like a dict keyed by the option names).
        settings = get_project_settings()
        # Bug fix: the original passed bare lists (e.g. port=['PORT']) instead of
        # reading values from settings, and pymysql expects passwd/db keywords.
        self.db = pymysql.Connect(
            host=settings['HOST'],
            port=settings['PORT'],
            user=settings['USER'],
            passwd=settings['PWD'],
            db=settings['DB'],
            charset=settings['CHARSET'],
        )

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        self.save_to_mysql(item)
        return item  # bug fix: original had the typo 'retrun'

    def save_to_mysql(self, item):
        """Insert one item into the `haha` table; roll back on failure."""
        cursor = self.db.cursor()
        # Parameterized query instead of "%s" string interpolation:
        # safe against SQL injection and broken quoting in field values.
        sql = ('insert into haha(face, name, age, content, haha_count, ping_count) '
               'values(%s, %s, %s, %s, %s, %s)')
        params = (item['face'], item['name'], item['age'],
                  item['content'], item['haha_count'], item['ping_count'])
        try:
            cursor.execute(sql, params)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        finally:
            # Bug fix: the original leaked a cursor per item.
            cursor.close()
mongodb:
1 配置文件:
ITEM_PIPELINES = {'firstbloodpro.pipelines.MongodbproPiepeline': 300},
2:管道文件:
import pymongo
class MongodbproPiepeline(object):
    """Scrapy pipeline that stores scraped items in MongoDB via pymongo."""

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(host='localhost', port=27017)

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Bug fix: the original 'db.self.client.xxx' was a NameError.
        # 选择数据库 (select the database from the client)
        db = self.client.xxx
        # 选择集合 (select the collection)
        col = db.xxxx
        # 将item转化为字典 (convert the item to a plain dict before inserting)
        dic = dict(item)
        # insert() was removed in pymongo 4.x; insert_one is the replacement.
        col.insert_one(dic)
        return item
sqlite:
在 管道文件中
import sqlite3
class Sqlite3proPipeline(object):
    """Scrapy pipeline that stores scraped items in a local SQLite database."""

    def open_spider(self, spider):
        # Bug fix: the original passed the bare name home.db (a NameError);
        # the database file name must be a string.
        self.db = sqlite3.connect('home.db')
        self.cur = self.db.cursor()

    def close_spider(self, spider):
        self.db.close()

    def process_item(self, item, spider):
        self.save_to_sqlite(item)
        return item

    def save_to_sqlite(self, item):
        """Insert one item into the `dameo` table; roll back on failure."""
        # Parameterized query with sqlite3's '?' placeholders instead of "%s"
        # string interpolation (the original also had a stray trailing quote
        # that made the statement a syntax error).
        sql = ('insert into dameo(city, title, rentway, price, housetype, '
               'area, address, traffic) values(?, ?, ?, ?, ?, ?, ?, ?)')
        params = (item['city'], item['title'], item['rentway'], item['price'],
                  item['housetype'], item['area'], item['address'], item['traffic'])
        try:
            self.cur.execute(sql, params)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
        return item
在配置文件中
ITEM_PIPELINES = {'firstbloodpro.pipelines.Sqlite3proPipeline': 300},
redis:
在配置文件中:
将DOWNLOAD_DELAY = 3 下面的全部换成这个
# 指定使用scrapy-redis的调度器
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
# 指定使用scrapy-redis的去重
DUPEFILTER_CLASS = 'scrapy_redis.dupefilter.RFPDupeFilter'
# 指定排序爬取地址时使用的队列,
# 默认的 按优先级排序(Scrapy默认),由sorted set实现的一种非FIFO、LIFO方式。
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
# 可选的 按先进先出排序(FIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderQueue'
# 可选的 按后进先出排序(LIFO)
# SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderStack'
# 在redis中保持scrapy-redis用到的各个队列,从而允许暂停和暂停后恢复,也就是不清理redis queues
SCHEDULER_PERSIST = True
# 只在使用SpiderQueue或者SpiderStack时有效的参数,指定爬虫关闭的最大间隔时间
# SCHEDULER_IDLE_BEFORE_CLOSE = 10
# 通过配置RedisPipeline将item写入key为 spider.name : items 的redis的list中,供后面的分布式处理item
# 这个已经由 scrapy-redis 实现,不需要我们写代码
ITEM_PIPELINES = {
'posted.pipelines.PostedPipeline': 300,
'scrapy_redis.pipelines.RedisPipeline': 400
}
# 指定redis数据库的连接参数
# REDIS_PASS是我自己加上的redis连接密码(默认不做)
REDIS_HOST = '127.0.0.1'
REDIS_PORT = 6379
#REDIS_PASS = 'redisP@ssw0rd'
# LOG等级
LOG_LEVEL = 'DEBUG'
#默认情况下,RFPDupeFilter只记录第一个重复请求。将DUPEFILTER_DEBUG设置为True会记录所有重复的请求。
DUPEFILTER_DEBUG =True
# 覆盖默认请求头,可以自己编写Downloader Middlewares设置代理和UserAgent
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
'Accept-Language': 'zh-CN,zh;q=0.8',
'Connection': 'keep-alive',
'Accept-Encoding': 'gzip, deflate, sdch'
}