Python asynchronous bulk inserts into a SQLite database (the same approach works for MySQL)

import pymysql
import time
import logging
from io import BytesIO
import base64
import re
from PIL import Image
import sqlite3
from twisted.enterprise import adbapi
from copy import copy


class BaidustreetpanoimgPipeline(object):
    def __init__(self, settings, sqlite_file, sqlite_table):
        # mysql
        self.conn = pymysql.connect(host=settings["MYSQL_HOST"],
                                    user=settings["MYSQL_USER"],
                                    password=settings["MYSQL_PASSWD"],
                                    db=settings["MYSQL_DBNAME"],
                                    port=settings["MYSQL_PORT"],
                                    cursorclass=settings["MYSQL_CURSORCLASS"]  # rows come back as field=value dicts
                                    )
        self.cursor = self.conn.cursor()
        self.mysql_table_points = settings["MYSQL_TABLEPOINTS"]
        # sqlite
        self.sqlite_file = sqlite_file
        self.sqlite_table = sqlite_table
        # synchronous alternative:
        # self.sqlite_conn = sqlite3.connect(self.sqlite_file)
        # self.cur = self.sqlite_conn.cursor()
        self.dbpool = adbapi.ConnectionPool('sqlite3',
                                            database=self.sqlite_file,
                                            check_same_thread=False)
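        # adbapi runs the blocking sqlite3 calls in Twisted's thread pool:
        # runInteraction() executes a callable with a cursor in a worker
        # thread and commits on success (rolls back on error), so inserts
        # never block the reactor. check_same_thread=False relaxes sqlite3's
        # same-thread guard so the pooled connection can be used from those
        # worker threads.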

    @classmethod
    def from_crawler(cls, crawler):
        # hands the settings values to the constructor, so they are reachable via self
        return cls(
            settings=crawler.settings,
            sqlite_file=crawler.settings.get('SQLITE_FILE'),  # pulled from settings.py
            sqlite_table=crawler.settings.get('SQLITE_TABLE', 'items')
        )
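    # Settings this pipeline expects (key names come from the code above;
    # the values below are only an illustrative sketch):
    #
    #   SQLITE_FILE = 'panoimg.db'
    #   SQLITE_TABLE = 'items'
    #   MYSQL_HOST = '127.0.0.1'
    #   MYSQL_PORT = 3306
    #   MYSQL_USER = 'root'
    #   MYSQL_PASSWD = '******'
    #   MYSQL_DBNAME = 'streetpano'
    #   MYSQL_TABLEPOINTS = 'points'
    #   MYSQL_CURSORCLASS = pymysql.cursors.DictCursor  # dict rows, as open_spider assumes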

    def open_spider(self, spider):
        # fetch the panoids whose images have already been crawled
        if spider.table_name is not None:
            self.mysql_table_points = spider.table_name
            self.sqlite_table = spider.table_name
        select_sql_url = "select * from {0} where TaskName=%s;".format(self.mysql_table_points)
        self.cursor.execute(select_sql_url, (spider.task_name,))
        arr_all_panoid = self.cursor.fetchall()
        for p in arr_all_panoid:
            p.pop("TaskName")
            p.pop("ImageWidth")
            spider.panoid_list.append(p)
        self.cursor.close()
        self.conn.close()
        logging.info("******** task started at ********** " + str(time.time()))

    def close_spider(self, spider):
        if len(spider.pgs) > 0:
            logging.info("******** rows left to insert ********** " + str(len(spider.pgs)))
            bulkdata = copy(spider.pgs)
            self.dbpool.runInteraction(self.bulk_insert_to_sqlite, spider.itemkeys, bulkdata)
            # synchronous alternative:
            # self.bulk_insert_to_sqlite(spider.itemkeys, spider.pgs)
            # clear the buffer
            del spider.pgs[:]
        logging.info("******** task finished at ********** " + str(time.time()))

    def process_item(self, item, spider):
        self.montage_pics(spider, item)
        if len(spider.pgs) >= 1000:  # flush the buffer every 1000 rows
            bulkdata = copy(spider.pgs)
            self.dbpool.runInteraction(self.bulk_insert_to_sqlite, spider.itemkeys, bulkdata)
            # clear the buffer
            del spider.pgs[:]
        return item

    # bulk insert
    def bulk_insert_to_sqlite(self, tx, itemkeys, bulkdata):
        try:
            insert_sql = "insert into {0}({1}) values ({2})".format(self.sqlite_table,
                                                                    ', '.join(itemkeys),
                                                                    ', '.join(['?'] * len(itemkeys)))
            tx.executemany(insert_sql, bulkdata)
        except sqlite3.Error as why:
            logging.error(why.args[0])
        else:
            logging.info("*************** stitched images inserted ***************")
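    # For illustration: with sqlite_table = 'items' and itemkeys = ['PanoID', 'Pic']
    # (hypothetical values), the generated statement is
    #     insert into items(PanoID, Pic) values (?, ?)
    # and executemany() binds one tuple from bulkdata per inserted row.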

    # stitch and compress the tiles
    # synchronous variant, kept for reference:
    # def deal_pics(self, spider, item):
    #     # self.cur = self.sqlite_conn.cursor()
    #     self.montage_pics(spider, item)
    #     self.sqlite_conn.commit()
    #     # self.cur.close()
    def montage_pics(self, spider, item):
        # fuzzy-match all tile keys for this panoid in one batch
        listkeys = spider.rds.keys(pattern="{}*".format(item["PanoID"]))
        if len(listkeys) == 32:  # a full panorama is 8 columns x 4 rows of 512px tiles
            # sort the keys by their trailing numeric indices
            listkeys.sort(key=lambda x: tuple(int(i) for i in re.findall(r'\d+', str(x))[-3:]))
            # batch-fetch the values for the sorted keys
            total_width = 8 * 512
            total_height = 4 * 512
            new_image = Image.new('RGB', (total_width, total_height))
            x_offset = 0
            y_offset = 0
            count = 1
            images = map(Image.open, [BytesIO(base64.b64decode(data)) for data in spider.rds.mget(listkeys)])
            for subitem in images:
                new_image.paste(subitem, (x_offset, y_offset))
                x_offset += subitem.size[0]
                if count % 8 == 0:  # row finished: wrap to the start of the next row
                    x_offset = 0
                    y_offset += subitem.size[1]
                    count = 1
                else:
                    count += 1
            imgByteArr = BytesIO()
            # resize() returns a new image, so keep the result
            # (Image.LANCZOS replaces Image.ANTIALIAS, removed in Pillow 10)
            out = new_image.resize((3328, 1664), Image.LANCZOS)
            out.save(imgByteArr, format='JPEG')  # quality
            imgByteArr = imgByteArr.getvalue()
            new_image.close()
            logging.info("*** stitching and compression succeeded ***")
            item.update({'Pic': sqlite3.Binary(imgByteArr)})
            if not spider.itemkeys:
                spider.itemkeys = list(item.keys())
            # stash the row in the bulk-insert buffer
            spider.pgs.append(tuple(item.values()))
            # delete the cached tile fragments from redis
            if len(spider.rds.keys(pattern='{0}*'.format(item["PanoID"]))) > 0:
                spider.rds.delete(*spider.rds.keys(pattern='{0}*'.format(item["PanoID"])))
                logging.info("************ tile cache cleared ************ " + str(len(spider.pgs)))
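
The pipeline assumes the spider carries a handful of attributes: task_name and table_name to pick the tables, panoid_list for already-crawled rows, pgs as the bulk-insert buffer, itemkeys for the column names, and rds as a redis client holding the base64 tiles. Below is a minimal sketch of that contract; the spider name, redis connection parameters, and defaults are assumptions, only the attribute names come from the pipeline above.

import redis
import scrapy


class BaidustreetpanoimgSpider(scrapy.Spider):
    name = 'baidu_street_pano_img'  # hypothetical name, for illustration only

    def __init__(self, task_name=None, table_name=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.task_name = task_name      # filters the MySQL points table in open_spider
        self.table_name = table_name    # overrides both table names when set
        self.panoid_list = []           # filled by the pipeline with already-crawled rows
        self.pgs = []                   # buffer of row tuples awaiting the bulk insert
        self.itemkeys = None            # column names, captured from the first stitched item
        self.rds = redis.StrictRedis(host='127.0.0.1', port=6379, db=0)  # tile cache (assumed local)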