Using the Python Scrapy framework's pipelines to bulk-store data into a MySQL database or a JSON file.

import os
import json

import MySQLdb
class CncompanyidSpiderFastPipeline(object):
    # Directory for the JSON output files; the original post never defines
    # self.jspath, so this placeholder path is an assumption
    jspath = './json/'

    def open_spider(self, spider):
        self.companylist = []  # buffer holding items until the next bulk insert
        self.conn = MySQLdb.connect(host="***", user="***", passwd="***", db="***", charset="utf8")
        self.cursor = self.conn.cursor()
        # Empty the table before storing new data:
        self.cursor.execute("truncate table cn_companyid")
        self.conn.commit()
    # Bulk-insert the buffered items into the MySQL database
    def bulk_insert_to_mysql(self, bulkdata):
        try:
            print("the length of the data-------", len(bulkdata))
            sql = "insert into cn_companyid (id, name) values(%s, %s)"
            # executemany() expects one parameter tuple per row, so pull the
            # two columns out of each dict-like item
            self.cursor.executemany(sql, [(d['id'], d['name']) for d in bulkdata])
            self.conn.commit()
        except Exception:
            self.conn.rollback()
    def process_item(self, item, spider):
        self.companylist.append(item)
        # item['params']['length'] carries the expected batch size;
        # flush once the buffer reaches it
        if len(self.companylist) == item['params']['length']:
            self.bulk_insert_to_mysql(self.companylist)
            # Save the same batch to a JSON file
            self.list_to_json(self.jspath + '{}.json'.format(item['params']['path']), self.companylist)
            # Clear the buffer
            del self.companylist[:]
        return item
    def close_spider(self, spider):
        # Flush whatever is left in the buffer before shutting down
        print("closing spider, last commit", len(self.companylist))
        self.bulk_insert_to_mysql(self.companylist)
        self.conn.commit()
        self.cursor.close()
        self.conn.close()
    def list_to_json(self, filepath, itemList):
        # Create the target directory if it does not exist yet
        head, tail = os.path.split(filepath)
        os.makedirs(head, exist_ok=True)
        with open(filepath, 'a') as f:
            line = json.dumps(list(map(dict, itemList)))
            f.write(line)
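
For the pipeline to run at all, Scrapy has to be told about it in the project's settings.py. A minimal sketch, assuming the project module is named myproject and the class above lives in myproject/pipelines.py (both names are placeholders, adjust them to the real project layout):

# settings.py
ITEM_PIPELINES = {
    # lower numbers run earlier; 300 is an arbitrary middle priority
    'myproject.pipelines.CncompanyidSpiderFastPipeline': 300,
}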
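process_item assumes every item carries a params dict with the expected batch size ('length') and a JSON file name ('path'), next to the id and name columns that bulk_insert_to_mysql writes. A sketch of a spider yielding items in that shape; the URL, the CSS selectors, and the choice to batch per response page are assumptions, not the original spider:

import scrapy

class CncompanyidSpider(scrapy.Spider):
    name = 'cn_companyid'
    start_urls = ['https://example.com/companies']  # placeholder URL

    def parse(self, response):
        rows = response.css('tr.company')  # placeholder selector
        for row in rows:
            yield {
                'id': row.css('td.id::text').get(),
                'name': row.css('td.name::text').get(),
                # 'length' tells the pipeline how many items form one batch;
                # 'path' names the JSON file that batch is written to
                'params': {'length': len(rows), 'path': self.name},
            }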
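The insert statement implies a cn_companyid table with at least id and name columns; the post never shows the actual DDL, so the column types below are guesses. A one-off setup sketch using the same MySQLdb connection style as the pipeline:

import MySQLdb

conn = MySQLdb.connect(host="***", user="***", passwd="***", db="***", charset="utf8")
cursor = conn.cursor()
# column types are assumptions based only on the insert statement above
cursor.execute("""
    create table if not exists cn_companyid (
        id varchar(64) primary key,
        name varchar(255)
    ) default charset=utf8
""")
conn.commit()
cursor.close()
conn.close()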