大多数数据处理场景,都是针对一个数据集执行特定的处理逻辑,然后将结果保存。下面记录的这个类以 Python 实现,使用 MySQL 作为结果存储。
在大量数据库 IO 的场景下,使用连接池可以实现连接资源复用、支持并发 IO、避免连接中断问题。实现连接池的方式有多种,例如可以用 SQLAlchemy 实现;下面记录的是基于 PooledDB 的实现。
# DBUtils 2.0.2
# PyMySQL 1.0.2
import pymysql
from dbutils.pooled_db import PooledDB
from concurrent.futures import ThreadPoolExecutor
class MysqlClient:
    """Thin MySQL access helper backed by a DBUtils ``PooledDB`` connection pool.

    Each operation borrows a pooled connection, executes, and releases the
    connection back to the pool (``conn.close()`` on a pooled connection does
    not disconnect — it returns the connection to the pool).
    """

    def __init__(self, user, password, database, host="localhost", port=3306, size=8):
        """
        :param user: MySQL user name
        :param password: MySQL password
        :param database: default schema to use
        :param host: server host, defaults to ``localhost``
        :param port: server port, defaults to 3306
        :param size: maximum number of pooled connections
        """
        self.user = user
        self.password = password
        self.database = database
        self.host = host
        self.port = port
        self.size = size
        self.pool = self.init_pooled_db()

    def init_pooled_db(self):
        """Create and return the ``PooledDB`` connection pool."""
        return PooledDB(
            creator=pymysql,           # DB-API module used to create raw connections
            maxconnections=self.size,  # max connections allowed; 0/None = unlimited
            mincached=2,               # idle connections opened at startup; 0 = none
            maxcached=0,               # max idle connections kept in the pool; 0/None = unlimited
            maxshared=0,               # max shared connections; 0/None = all shared
            blocking=True,             # True: block when the pool is exhausted; False: raise
            maxusage=None,             # times a connection may be reused; None = unlimited
            setsession=[],             # SQL run at session start, e.g. ["set time zone ..."]
            ping=0,
            # When to ping the MySQL server to check liveness:
            # 0 = never, 1 = whenever it is requested, 2 = when a cursor
            # is created, 4 = when a query is executed, 7 = always
            host=self.host,
            port=self.port,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8'
        )

    def create_conn(self):
        """Borrow a connection from the pool and open a cursor on it.

        :return: ``(connection, cursor)`` tuple
        """
        conn = self.pool.connection()
        # cursor = conn.cursor(pymysql.cursors.DictCursor)
        cursor = conn.cursor()
        return conn, cursor

    @staticmethod
    def close_conn(conn, cursor):
        """Close the cursor, then release the connection back to the pool.

        Bug fix: the original closed the connection *before* the cursor;
        the cursor must be closed while its connection is still usable.
        """
        cursor.close()
        conn.close()

    def select_one(self, sql, args=None):
        """Execute ``sql`` with ``args`` and return the first row (or None)."""
        conn, cur = self.create_conn()
        try:
            cur.execute(sql, args)
            return cur.fetchone()
        finally:
            self.close_conn(conn, cur)

    def select_all(self, sql, args=None):
        """Execute ``sql`` with ``args`` and return all rows.

        Bug fix: the original ignored ``args`` entirely.  ``args or None``
        keeps legacy callers that passed ``''`` (empty string) working —
        pymysql skips parameter interpolation only when args is None.
        """
        conn, cur = self.create_conn()
        try:
            cur.execute(sql, args or None)
            return cur.fetchall()
        finally:
            self.close_conn(conn, cur)

    def insert_one(self, sql, args=None):
        """Execute an INSERT and commit; roll back and re-raise on failure."""
        conn, cur = self.create_conn()
        try:
            cur.execute(sql, args)
            conn.commit()
            return True
        except Exception:
            conn.rollback()
            raise
        finally:
            self.close_conn(conn, cur)

    def delete_one(self, sql, args=None):
        """Execute a DELETE/TRUNCATE and commit; roll back and re-raise on failure."""
        conn, cur = self.create_conn()
        try:
            cur.execute(sql, args)
            conn.commit()
            return True
        except Exception:
            conn.rollback()
            raise
        finally:
            self.close_conn(conn, cur)

    def update_one(self, sql, args=None):
        """Execute an UPDATE and commit; roll back and re-raise on failure."""
        conn, cur = self.create_conn()
        try:
            cur.execute(sql, args)
            conn.commit()
            return True
        except Exception:
            conn.rollback()
            raise
        finally:
            self.close_conn(conn, cur)
对于数据处理业务,可以将其封装成方法并通过线程池机制来执行;最终结果集的写入可以采用批量写入的方式,以减少数据库 IO、提高写入速度。
class ProcessBase:
    """Template for a batch job: read a data set, transform each row via
    ``task``, then bulk-insert the accumulated ``self.values``.

    Subclasses configure the ``process_*`` / ``insert_*`` attributes and
    override the hook methods (``task``, ``make_dict``, ``before_task``,
    ``after_task``) as needed.
    """

    def __init__(self, user, password, database, host='localhost', port=3306, db_pool_size=8, thread_pool_size=8):
        """
        :param db_pool_size: size of the MySQL connection pool
        :param thread_pool_size: worker threads used for the batched inserts
        """
        self.mysql_client = MysqlClient(user=user,
                                        password=password,
                                        database=database,
                                        host=host,
                                        port=port,
                                        size=db_pool_size)
        self.executor = ThreadPoolExecutor(max_workers=thread_pool_size)
        # Table currently being processed
        self.process_table = ""
        self.process_fields = []
        # WHERE clause selecting the data set to process
        self.process_where_clause = ""
        # SQL that retrieves the data set to process.
        # NOTE(review): built eagerly from the (still empty) attributes
        # above — a subclass that changes process_table/fields/where after
        # super().__init__() must rebuild process_sql itself (or assign it
        # directly).
        self.process_sql = 'SELECT {} FROM {} WHERE {}'.format(','.join(self.process_fields),
                                                               self.process_table,
                                                               self.process_where_clause)
        # Whether to truncate the table before processing starts
        self.delete_before = True
        self.insert_table = ''
        self.insert_fields = []
        # Prefix of the INSERT statement; same eager-build caveat as above.
        self.prefix_sql = "INSERT INTO {} ({}) VALUES ".format(self.insert_table,
                                                               ','.join(self.insert_fields))
        # Template for one VALUES tuple, e.g. "('{}','{}')"
        self.insert_value_module = "({})".format(",".join(["'{}'" for i in range(len(self.insert_fields))]))
        # Lookup-dictionary level passed to make_dict()
        self.dict_level = 0
        # Result rows accumulated by task(); consumed by commit_insert()
        self.values = []
        self.doorplate_dict = {}
        self.building_dict = {}
        self.unit_dict = {}
        self.floor_dict = {}

    def task(self, item):
        """
        Per-row processing hook.

        :param item: one row returned by ``self.process_sql``
        :return: None — append the processed result to ``self.values``
        """
        print("无任务处理...")

    def make_dict(self, dict_level):
        """
        Cache data needed repeatedly during ``task``; ``dict_level``
        controls which dictionaries get built.

        :return: None
        """
        print("无数据字典创建...")

    def commit_insert(self, prefix_sql, items, num=100):
        """
        Bulk-insert ``items`` in batches of ``num`` via the thread pool.

        Bug fix: the original never inspected the submitted futures, so an
        insert failure inside a worker thread was silently swallowed; the
        futures are now collected and re-raised after the pool drains.

        :param prefix_sql: ``INSERT INTO TABLE_NAME(...) VALUES``
        :param items: pre-rendered value tuples, e.g. ``["('a','b')", ...]``
        :param num: rows per INSERT statement
        :return: None
        :raises Exception: first error hit by any insert worker
        """
        total = len(items)
        futures = []
        start = 0
        while start < total:
            sql = prefix_sql + ",".join(items[start: start + num])
            print('执行插入: {}~{} / {}'.format(start, start + num, total))
            futures.append(self.executor.submit(self.mysql_client.insert_one, sql, None))
            start += num
        # NOTE: the executor is single-shot — it cannot be reused after this.
        self.executor.shutdown(wait=True)
        # Surface any worker exception instead of swallowing it.
        for future in futures:
            future.result()

    def before_task(self):
        """
        Pre-``task`` hook: e.g. convert NULLs to '', normalize irregular
        data, so ``task`` itself stays simple.

        :return: None
        """
        print("task之前无处理...")

    def after_task(self):
        """
        Post-``task`` hook: work that would complicate ``task`` if done
        inline can be extracted here.

        :return: None
        """
        print("task之后无处理...")

    def run(self):
        """Execute the pipeline: optional truncate, dictionary build,
        ``before_task``, per-row ``task``, bulk insert, ``after_task``."""
        if self.delete_before:
            if not self.process_table:
                raise ValueError("process_table 未配置!")
            print('开始清除历史数据...')
            # NOTE(review): this truncates process_table, while results are
            # inserted into insert_table — confirm that is intended.
            self.mysql_client.delete_one("truncate table {}".format(self.process_table), None)
        self.make_dict(self.dict_level)
        self.before_task()
        # process_sql is already fully rendered, so no args are needed.
        items = self.mysql_client.select_all(self.process_sql, None)
        total = len(items)
        for current, item in enumerate(items, 1):
            print("{} / {}".format(current, total))
            self.task(item)
        self.commit_insert(
            self.prefix_sql,
            self.values
        )
        self.after_task()