1.同步
class MysqlPipeline(object):
    """
    Write items to MySQL synchronously.

    Each process_item call blocks the Scrapy engine until the INSERT
    commits, so this is only suitable for low-volume crawls.
    """

    def __init__(self):
        # NOTE(review): connection parameters are hard-coded; consider
        # reading them from Scrapy settings like MysqlTwistedPipeline does.
        self.conn = pymysql.connect(host="127.0.0.1", user="root", password="ts123456",
                                    db="art_schema", port=3306)
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one article row and return the item for later pipelines.

        Fix: the original built the SQL with str.format(), which leaves
        string values unquoted (invalid SQL) and is open to SQL injection.
        Use a parameterized query so pymysql quotes/escapes the values.
        """
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s);
        """
        self.cursor.execute(insert_sql,
                            (item['title'], item['url'],
                             item['create_date'], item['fav_nums']))
        self.conn.commit()
        return item
2.异步
class MysqlTwistedPipeline(object):
    """Write items to MySQL asynchronously through Twisted's adbapi pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor used by Scrapy; builds the connection pool
        from project settings."""
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            # NOTE: MYSQL_PORT must be an int, not a str, or the
            # connection attempt fails.
            port=settings['MYSQL_PORT'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """
        Schedule an asynchronous INSERT via Twisted and return the item.

        Fix: the original returned None, which would pass None on to any
        pipeline that runs after this one; Scrapy item pipelines must
        return the item (or raise DropItem).

        :param item: the scraped item to persist
        :param spider: the spider that produced the item
        :return: the same item, unmodified
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Forward item/spider so insert failures can be diagnosed.
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item=None, spider=None):
        # Report async-insert failures.  The extra parameters default to
        # None so existing errbacks that pass only the failure still work.
        print(failure, item, spider)

    def do_insert(self, cursor, item):
        """Run one parameterized INSERT inside the pool's transaction."""
        insert_sql = """
            insert into jobbole_article(title,url,create_date,fav_nums,url_object_id,front_image_url,
            front_image_path,praise_nums,comment_nums,tags,content)
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
        """
        cursor.execute(insert_sql,
                       (item['title'], item['url'], item['create_date'],
                        item['fav_nums'], item['url_object_id'],
                        item['front_image_url'], item['front_image_path'],
                        item['praise_nums'], item['comment_nums'],
                        item['tags'], item['content']))
3.修改MysqlTwistedPipeline为公用pipeline(防止多个pipeline去打开数据库造成异常)
item.py中的修改,增加一个get_insert_sql()
class JobBoleArticleItem(scrapy.Item):
    """One jobbole article, plus the SQL needed to persist it.

    get_insert_sql() lets a single shared pipeline insert any item type
    without knowing its table or columns.
    """

    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole")
        # output_processor=TakeFirst()
    )
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Return an (insert_sql, params) pair for this item."""
        insert_sql = """
            insert into jobbole_article(title,url,create_date,fav_nums,url_object_id,front_image_url,
            front_image_path,praise_nums,comment_nums,tags,content)
            values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
        """
        # Column order must match the placeholder order above.
        columns = ("title", "url", "create_date", "fav_nums", "url_object_id",
                   "front_image_url", "front_image_path", "praise_nums",
                   "comment_nums", "tags", "content")
        params = tuple(self[col] for col in columns)
        return insert_sql, params
pipeline中的修改:
def do_insert(self, cursor, item):
    """Run one INSERT inside the connection pool's transaction.

    Each item class supplies its own SQL via get_insert_sql(), so this
    single pipeline method works for every item type in the project.
    """
    statement, values = item.get_insert_sql()
    cursor.execute(statement, values)
特别注意:MYSQL_PORT(port=3306)必须是 int 类型,不能是 str 类型,否则 pymysql 建立连接时会报错。