单线程阻塞操作MySQL
import MySQLdb
class MysqlPipeline(object):
def __init__(self):
self.conn = MySQLdb.connect('host', 'user', 'passwd', 'dbname', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
insert_sql = """
insert into dbname(Field,Field)VALUES (%s, %s)
"""
self.cursor.execute(insert_sql, (value,value))
self.conn.commit()
print '正在插入数据'
return item
def close_spider(self, spider):
#接收结束信号
self.conn.close()
print '完成数据插入'
利用scrapy中的Twisted完成异步操作MySQL
“`
from twisted.enterprise import adbapi
import MySQLdb.cursors
class MysqlTwistedPipeline(object):
def __init__(self, dbpoll):
self.dbpoll = dbpoll
self.file = open('mysql_error.json', 'w')
@classmethod
def from_settings(cls, settings):
#会被scrapy调用的类方法,返回一个dbpoll对象传递给MysqlTwistedPipeline类,构建新实例
dbparams = dict(host = settings['MYSQL_HOST'],
db = settings['MYSQL_DBNAME'],
user = settings['MYSQL_USER'],
passwd = settings['MYSQL_PASSWORD'],
charset = 'utf8mb4',
use_unicode = True,
cursorclass = MySQLdb.cursors.DictCursor
)
#第一个参数,操作的数据库,利用反射原理,第二个参数MySQL初始化的一些参数,格式固定
dbpoll = adbapi.ConnectionPool('MySQLdb', **dbparams)
#返货一个MysqlTwistedPipeline的实例对象
return cls(dbpoll)
def process_item(self, item, spider):
#处理item 通过runInteraction执行异步操作,第一个参数是具体的操作过程,第二个参数是操作数据
#并返回一个query,query是Deferred类的实例,调用其中的addErrback方法处理异常
query = self.dbpoll.runInteraction(self.do_insert, item)
query.addErrback(self.handle_error, item, spider)
return item
def handle_error(self, failure, item , spider ):
#处理MySQL异常 ,异常写入mysql_error.json文件中
item['create_date'] = str(item['create_date'])
error_json = json.dumps(dict(item), ensure_ascii=False)
self.file.write(error_json.encode('utf-8')+'\n'+(str(failure))+'\n')
print (failure)
def do_insert(self, cursor, item):
#执行MySQL语句
insert_sql = """
insert into dbname(Field,Field)VALUES (%s, %s)
"""
cursor.execute(insert_sql, (value,value))
def close_spider(self, spider):
self.file.close()
print '完成mysql_error写入'
其中 charset = ‘utf8mb4’可解决MySQL不能存储特殊表情的问题, 在表结构和MySQL初始化中同时设置即可