1.同步
-
class MysqlPipeline(object):
    """Write items to MySQL synchronously.

    Every ``process_item`` call blocks the crawl while the INSERT and
    COMMIT run, so this is only suitable for low-volume spiders.
    """

    def __init__(self):
        # NOTE(review): credentials are hard-coded; in real use they
        # should come from scrapy settings.
        # port must be an int, not a str (see note at the end of the post).
        self.conn = pymysql.connect(
            host="127.0.0.1",
            user="root",
            password="ts123456",
            db="art_schema",
            port=3306,
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        """Insert one article row and commit, then pass the item on.

        Uses a parameterized query instead of ``str.format``: the original
        interpolated unquoted string values directly into the SQL, which
        both produced invalid SQL and allowed SQL injection.
        """
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums)
            values (%s, %s, %s, %s);
        """
        self.cursor.execute(
            insert_sql,
            (item['title'], item['url'], item['create_date'], item['fav_nums']),
        )
        self.conn.commit()
        return item
2.异步
-
class MysqlTwistedPipeline(object):
    """Write items to MySQL asynchronously via twisted's adbapi pool."""

    def __init__(self, dbpool):
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Alternate constructor: build the connection pool from settings.

        Scrapy calls this automatically, passing the project settings.
        """
        dbparms = dict(
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            # must be an int, not a str (see note at the end of the post)
            port=settings['MYSQL_PORT'],
            charset='utf8',
            cursorclass=pymysql.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('pymysql', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert on the pool and return the item.

        Returning the item keeps it flowing to later pipelines; the
        original version implicitly returned None, which silently
        replaced the item with None for every downstream pipeline.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Handle async-insert failures; extra args give the handler context.
        query.addErrback(self.handle_error, item, spider)
        return item

    def handle_error(self, failure, item=None, spider=None):
        # Report the asynchronous-insert failure (defaults keep the old
        # one-argument call signature working).
        print(failure)

    def do_insert(self, cursor, item):
        # Runs in a pool thread; adbapi commits the interaction for us.
        insert_sql = """
            insert into jobbole_article(title, url, create_date, fav_nums, url_object_id,
                front_image_url, front_image_path, praise_nums, comment_nums, tags, content)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);
        """
        cursor.execute(
            insert_sql,
            (item['title'], item['url'], item['create_date'], item['fav_nums'],
             item['url_object_id'], item['front_image_url'], item['front_image_path'],
             item['praise_nums'], item['comment_nums'], item['tags'], item['content']),
        )
3.修改MysqlTwistedPipeline为公用pipeline(防止多个pipeline去打开数据库造成异常)
item.py中的修改:为每个Item类增加一个get_insert_sql()方法,返回该Item对应的插入SQL语句及参数元组
-
class JobBoleArticleItem(scrapy.Item):
    """Article item; fields carry their own input/output processors.

    ``get_insert_sql`` lets one shared pipeline insert any item type
    without knowing its schema.
    """

    title = scrapy.Field(
        input_processor=MapCompose(lambda x: x + "-jobbole")
        # output_processor=TakeFirst()
    )
    create_date = scrapy.Field()
    url = scrapy.Field()
    url_object_id = scrapy.Field()
    front_image_url = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    front_image_path = scrapy.Field()
    praise_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    fav_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    comment_nums = scrapy.Field(
        input_processor=MapCompose(get_nums)
    )
    tags = scrapy.Field(
        input_processor=MapCompose(remove_comment_tags),
        output_processor=Join(",")
    )
    content = scrapy.Field()

    def get_insert_sql(self):
        """Return ``(sql, params)`` for inserting this item into MySQL."""
        insert_sql = """
        insert into jobbole_article(title,url,create_date,fav_nums,url_object_id,front_image_url,
        front_image_path,praise_nums,comment_nums,tags,content)
        values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
        """
        params = (
            self['title'], self['url'], self['create_date'], self['fav_nums'],
            self['url_object_id'], self['front_image_url'], self['front_image_path'],
            self['praise_nums'], self['comment_nums'], self['tags'], self['content'],
        )
        return insert_sql, params
pipeline中的修改:
-
def do_insert(self, cursor, item):
    """Shared insert hook: ask the item for its own SQL and parameters.

    Each item class builds its own statement via ``get_insert_sql()``,
    so one pipeline serves every item type without opening additional
    database connections.
    """
    sql, args = item.get_insert_sql()
    cursor.execute(sql, args)
特别注意:port=3306 必须是 int 类型,不能传入 str 类型,否则 pymysql 建立连接时会报错
转载: https://blog.csdn.net/qq_15695761/article/details/79931120