现在要保存知乎的数据到mysql中,就意味着要重写pipeline来处理所有的item,或者每个网站各有一个pipeline,但是这样子如果有上百个网站,就得写对应数量的pipeline,要向mysql发出上百个连接,显然不合理。并且在同一个twistedpipeline中,只有插入语句是变化的,所以根据不同的item,构建不同的sql语句并插入到mysql中。参考别人的做法,仿照Django中的model,来屏蔽具体的sql语句。所以可以将inser_sql语句放入具体的item中。
items.py
class JobBoleArticleItem(scrapy.Item):
title = scrapy.Field()
create_date = scrapy.Field(
input_processor=MapCompose(date_convert)
)
url = scrapy.Field()
url_object_id = scrapy.Field()
front_image_url = scrapy.Field(
output_processor=MapCompose(return_value)
)
front_image_path = scrapy.Field()
praise_nums = scrapy.Field(
input_processor=MapCompose(get_nums)
)
comment_nums = scrapy.Field(
input_processor=MapCompose(get_nums)
)
fav_nums = scrapy.Field(
input_processor=MapCompose(get_nums)
)
tags = scrapy.Field(
input_processor=MapCompose(remove_comment_tags),
output_processor=Join(",")
)
content = scrapy.Field()
def get_insert_sql(self):
#执行具体的插入
#根据不同的item构建不同的sql语句并插入到mysql中
insert_sql = """
insert into jobbole_article(title, create_date, url, fav_nums,
url_object_id, front_image_url,
praise_nums, comment_nums, tags, content)
VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
"""
params = (self["title"], self["create_date"], self["url"], self["fav_nums"],
self["url_object_id"], self["front_image_url"], self["praise_nums"],
self["comment_nums"], self["tags"],
self["content"])
return insert_sql, params
pipelines.py
class MysqlTwistedPipeline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbparms = dict(
host = settings["MYSQL_HOST"],
db = settings["MYSQL_DBNAME"],
user = settings["MYSQL_USER"],
passwd = settings["MYSQL_PASSWORD"],
charset = "utf8",
cursorclass = MySQLdb.cursors.DictCursor,
use_unicode = True,
)
dbpool = adbapi.ConnectionPool("MySQLdb", **dbparms)
return cls(dbpool)
def process_item(self, item, spider):
#使用twisted将mysql插入变成异步执行
query = self.dbpool.runInteraction(self.do_insert, item)
query.addErrback(self.handle_error)
def handle_error(self, failure):
#处理异步插入的异常
print (failure)
def do_insert(self, cursor, item):
#执行具体的插入
#根据不同的item构建不同的sql语句并插入到mysql中
insert_sql, params = item.get_insert_sql()
cursor.execute(insert_sql, (item["title"], item["create_date"], item["url"], item["fav_nums"],
item["url_object_id"], item["front_image_url"], item["praise_nums"],
item["comment_nums"], item["tags"],
item["content"]))