说明:主要使用 twisted.enterprise.adbapi 与 pymysql 在 scrapy pipelines 中的应用。包含查询,修改以及新增。
- 参考文章 pipelines.py
1、pipelines.py
import pymysql
from twisted.enterprise import adbapi
from NCSpider.items import NCItem
class MysqlPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
"""
连接MySQL数据库
:param settings:
:return:
"""
params = dict(
host=settings["MYSQL_HOST"],
port=settings['MYSQL_PORT'],
db=settings["MYSQL_DB"],
user=settings["MYSQL_USER"],
password=settings["MYSQL_PASSWORD"],
charset='utf8mb4',
cursorclass=pymysql.cursors.DictCursor,
use_unicode=True,
)
dbpool = adbapi.ConnectionPool("pymysql", **params)
return cls(dbpool=dbpool)
@staticmethod
def get_update_sql(item, table_name, _id):
"""
获取修改数据库sql
:param item:
:param table_name:
:param _id: 数据id
:return:
"""
v = ','.join(["{}=%s".format(k) for k in item.keys()])
sql = """update `{0}` set {1} where id={2}""".format(table_name, v, _id)
return sql
def _do_novel_update(self, cursor, table_name, item, spider):
"""
修改novel表
:param cursor:
:param table_name:
:param item:
:param spider:
:return:
"""
sql = """select id from `{0}` where novel_id='{1}' limit 1;""".format(table_name, item['novel_id'])
cursor.execute(sql)
# result 是 字典类型
result = cursor.fetchone()
if result:
update_sql = self.get_update_sql(item, table_name, result['id'])
cursor.execute(update_sql, tuple(item.values()))
spider.logger.info('update novel Item2Sql over -- ' + item['novel_en_name'])
def _do_comment_update_or_insert(self, cursor, table_name, item, spider):
"""
修改comments 表
:param cursor:
:param table_name:
:param item:
:param spider:
:return:
"""
sql = """select id, vote_num from `{0}` where comment_id='{1}' limit 1;""".format(table_name, item['comment_id'])
cursor.execute(sql)
# result 是 字典类型
result = cursor.fetchone()
if result:
if str(result['vote_num']) != str(item['vote_num']):
update_sql = self.get_update_sql(item, table_name, result['id'])
cursor.execute(update_sql, tuple(item.values()))
spider.logger.info('update comments Item2Sql over -- ' + item['comment_id'])
else:
insert_sql = self.get_insert_sql(table_name, item)
cursor.execute(insert_sql)
spider.logger.info('save comments Item2Sql over -- ' + item['comment_id'])
@staticmethod
def get_insert_sql(table_name, item):
"""
获取插入数据库sql
:param table_name:
:param item:
:return:
"""
keys = ""
values = ""
for key, value in item.items():
keys += "`" + key + "`,"
if type(value) in [int, float]:
values += str(value) + ","
else:
if value is None:
values += " null,"
else:
values += "'" + pymysql.escape_string(value) + "',"
sql = "INSERT INTO `" + table_name + "` (" + keys[0:-1] + ") VALUES (" + values[0:-1] + ");"
return sql
def process_item(self, item, spider):
try:
if isinstance(item, NCItem):
comment_table = 'comments'
d = self.dbpool.runInteraction(self._do_comment_update_or_insert, comment_table, item, spider)
d.addErrback(self._handle_error, item, spider)
else:
table = 'novel'
d = self.dbpool.runInteraction(self._do_novel_update, table, item, spider)
d.addErrback(self._handle_error, item, spider)
except Exception as e:
spider.logger.error('save Item2Sql Error -- ' + str(e))
return item
@staticmethod
def _handle_error(failure, item, spider):
"""Handle occurred on db interaction."""
spider.logger.error(failure)
注意事项:在使用查询语句时,fetchone() 返回的是字典类型,查询字段作为key。
2、 settings.py 主要配置
# mysql 配置
MYSQL_HOST = 'localhost'
MYSQL_PORT = 3306
MYSQL_USER = 'root'
MYSQL_PASSWORD = ''
MYSQL_DB = ''
ITEM_PIPELINES = {
'NCSpider.pipelines.MysqlPipeline': 300,
}


被折叠的 条评论
为什么被折叠?



