方法一:采用同步的机制写入mysql,先爬取item按序写入。
假设在这里插入‘text’和‘author’两项数据
先通过Navicat创建数据库(article_spider)、数据表(article)以及数据的类型等(text longtext not null, author varchar 100 not null)
然后在pipeline中设置如下:
import MySQLdb
class MysqlPipeline(object):
#采用同步的机制写入mysql
def __init__(self):
self.conn = MySQLdb.connect('127.0.0.1', 'root', '123456', 'article_spider', charset="utf8", use_unicode=True)
self.cursor = self.conn.cursor()
def process_item(self, item, spider):
insert_sql = """
insert into article(text, author)
VALUES (%s, %s)
"""
self.cursor.execute(insert_sql, (item["text"], item["author"]))
self.conn.commit()
然后在settings中开启pipeline即可。
方法二:异步插入数据,不会因为爬取的item太多而导致数据拥堵,效率更高
先在setting中提供如下信息:这些都是需要自定义的
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'article_spider'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
然后在pipeline中设置如下:
import MySQLdb
import MySQLdb.cursors
import twisted.enterprise import adbapi
class MysqlTwistedPipline(object):
def __init__(self, dbpool):
self.dbpool = dbpool
@classmethod
def from_settings(cls, settings):
dbparms = dict(
host = settings["MYSQL_HOST"],
db = settings["MYSQL_DBNAME"],
user = settings["MYSQL_USER"],
passwd = settings["MYSQL_PASSWORD"],
charset='utf8',
cursorclass=MySQLdb.cursors.DictCursor,
use_unicode=True,
)
dbpool = adbapi.ConnectionPool("pymysql", **dbparms)
return cls(dbpool)
def process_item(self, item, spider):
#使用twisted将mysql插入变成异步执行
query = self.dbpool.runInteraction(self.do_insert, item)
query.addErrback(self.handle_error, item, spider) #处理异常
def handle_error(self, failure, item, spider):
#处理异步插入的异常
print (failure)
def do_insert(self, cursor, item):
#执行具体的插入
#根据不同的item 构建不同的sql语句并插入到mysql中
# insert_sql, params = item.get_insert_sql()
# cursor.execute(insert_sql, params)
insert_sql = """
insert into article(text, author,tags)#这里数据表设置名称为article
VALUES (%s, %s, %s)#有几个item就有几个%s
"""
cursor.execute(insert_sql, (item["text"], item["author"], item['tags']))
然后在settings中开启pipeline即可。