First, configure the basic Redis connection settings in settings.py (if you prefer, you can also hard-code the values directly in the pipeline instead), together with ITEM_PIPELINES (the numbers are priorities: pipelines with lower values run first):
REDIS_HOST = 'localhost'
REDIS_PORT = 6379
REDIS_DB_INDEX = 0
REDIS_PASSWORD =""
ITEM_PIPELINES = {
'xiaoshuo.pipelines.XiaoshuoPipeline': 200,
'xiaoshuo.pipelines.MysqlPipeline': 300,
'xiaoshuo.pipelines.RedisPipeline': 400,
}
In the pipeline code below, item holds the data collected by the spider, and its type is the Item class defined in items.py (a sketch of a matching items.py is shown after the pipelines code). That may sound a bit roundabout, so here is the pipelines code directly:
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
import redis
class XiaoshuoPipeline(object):
    def process_item(self, item, spider):
        return item
class MysqlPipeline(object):
    """
    Synchronous MySQL storage
    """
    def __init__(self):
        # Open the connection (add charset='utf8' if Chinese text needs to be stored)
        self.conn = pymysql.connect(host='localhost', user='root', password='', database='xiao_shuo')
        # Create a cursor
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        # SQL statement
        insert_sql = """
            insert into xiao(writer, type, data_time, content) VALUES (%s, %s, %s, %s)
        """
        # Insert the scraped data into the database
        self.cursor.execute(insert_sql, (item['writer'], item['type'], item['data_time'], item['content']))
        # Commit; without a commit nothing is actually saved to the database
        self.conn.commit()
        return item  # return the item so later pipelines can use it

    def close_spider(self, spider):
        # Close the cursor and the connection
        self.cursor.close()
        self.conn.close()
class RedisPipeline(object):
    def open_spider(self, spider):
        # First argument: the key in settings.py; second argument: the default used when the key is not set
        host = spider.settings.get("REDIS_HOST", "localhost")
        port = spider.settings.get("REDIS_PORT", 6379)
        db_index = spider.settings.get("REDIS_DB_INDEX", 0)
        db_psd = spider.settings.get("REDIS_PASSWORD", "")
        # Connect to Redis
        self.db_conn = redis.StrictRedis(host=host, port=port, db=db_index, password=db_psd)

    def process_item(self, item, spider):
        # Push the scraped URL onto the "url" list
        self.db_conn.rpush("url", item['href_url'])
        print("*" * 100)
        return item

    def close_spider(self, spider):
        # Close the connection
        self.db_conn.connection_pool.disconnect()