介绍
对于一个web服务,性能的瓶颈最终基本上都会出现在数据库读取的这一步上,如果能够在数据库读取数据的这一段时间自动切换去处理其他请求的话,服务的性能会得到非常显著的提升,因此需要选择一个合适的异步驱动和工具包
SQLAlchemy是一个python中发展比较成熟的ORM数据库工具包,在比较早期的时候它只是一个同步ORM,在1.4版本的时候引入了协程并支持了异步的功能,下面以最通用和常用为出发点,选择aiomysql + sqlalchemy介绍一些比较好的实践
本篇文章内容主要包含:
- SQLAlchemy的一些用法和最佳实践介绍
- 一个使用SQLAlchemy连接实现pandas的异步read_sql_query方法的实现(也是对上一篇文章末尾提的建议做一个结束)
创建异步引擎并执行SQL查询
from sqlalchemy.engine import URL
from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
# Build the connection URL first, then hand it to the async engine factory.
database_url = URL.create("mysql+aiomysql", "root", "root", "localhost", 3306, "mysql")
engine: AsyncEngine = create_async_engine(database_url)
创建方式和同步的create_engine()方法没有很大的差别,参数都是一样的
创建好查询引擎之后,接下来使用创建的引擎操作数据库读取数据
# -*- coding: utf-8 -*-
import asyncio
from sqlalchemy import text
from sqlalchemy.engine import URL
from sqlalchemy.ext.asyncio import create_async_engine, AsyncEngine
loop = asyncio.get_event_loop()
# Connection URL for the demo database; positional args are
# (drivername, username, password, host, port, database).
services_url = URL.create("mysql+aiomysql", "root", "root", "localhost", 3306, "services")
engine: AsyncEngine = create_async_engine(services_url)
# test_table:
# id name value
# 0 1 abc 123
# 1 2 def 456
# 2 3 fgi 789
# 3 4 fdsas 654
# 4 5 asda 111
async def get_data():
    """Demonstrate fetchone / fetchmany / fetchall on an async query result.

    The fetch* methods are called synchronously on the CursorResult and
    advance a shared cursor, so each call continues where the last stopped.
    """
    async with engine.connect() as conn:
        # Bound-parameter variant kept for reference:
        # query = text("select * from test_table where id > :id or id = :id2").bindparams(id=3, id2=1)
        query = text("select * from test_table").bindparams(**{})
        result = await conn.execute(query)
        ttl = result.rowcount
        print(f"总行数:{ttl}")
        # Guard the progress ratio against ZeroDivisionError on an empty table.
        denom = max(ttl, 1)
        data = result.fetchone()
        print(data, type(data))  # each fetched row is a Row object
        # Row stopped being directly dict()-able in SQLAlchemy 2.0;
        # ._mapping gives a dict view and works on 1.4 as well.
        print(dict(data._mapping))
        length = 1
        print(f"进度:{length / denom}")
        # a further fetchone() continues from the next row
        data = result.fetchone()
        print(data)
        length += 1
        print(f"进度:{length / denom}")
        data = result.fetchmany(2)
        print(data)
        # convert the rows to the common "records" format
        print([dict(row._mapping) for row in data])
        length += len(data)
        print(f"进度:{length / denom}")
        # fetchmany() also resumes after the previous fetch and returns an
        # empty list [] once the result set is exhausted
        data = result.fetchmany(5)
        print(data)
        print([dict(row._mapping) for row in data])
        length += len(data)
        print(f"进度:{length / denom}")
        data = result.fetchall()
        print(data)
        print([dict(row._mapping) for row in data])
        length += len(data)
        print(f"进度:{length / denom}")
使用engine.connect()从实例中维护的连接池中获取一个连接,然后调用其execute方法执行查询即可。注意这里的sql需要用SQLAlchemy的text包起来才能查询;另外这里execute得到的对象和单独使用aiomysql框架的连接时稍有不同:execute会返回一个CursorResult对象,然后使用同步的方式对它执行fetchone(), fetchmany(), fetchall()等方法,而且多次调用fetchone()会逐行返回,调用fetchmany()方法也会从上一次fetch之后继续读取,相当于调用一个生成器的next方法
前面说了sql传入执行之前要用SQLAlchemy的text方法包起来,这样子做稍微显得有点麻烦,不过这样子做可以让我们很方便地使用占位符:
async def get_data2():
    """Run a parameterised query: bindparams() fills the :id / :id2 placeholders."""
    async with engine.connect() as conn:
        query = text("select * from test_table where id > :id or id = :id2").bindparams(id=2, id2=1)
        # query = text("select * from test_table").bindparams(**{})
        result = await conn.execute(query)
        # await conn.commit()  # remember to commit when the SQL modifies data
        data = result.fetchall()
        print(data)
        # Row -> dict via ._mapping: dict(row) was removed in SQLAlchemy 2.0,
        # while ._mapping works on both 1.4 and 2.0.
        print([dict(row._mapping) for row in data])
[(1, 'abc', '123'), (3, 'fgi', '789'), (4, 'fdsas', '654'), (5, 'asda', '111')]
[{'id': 1, 'name': 'abc', 'value': '123'}, {'id': 3, 'name': 'fgi', 'value': '789'}, {'id': 4, 'name': 'fdsas', 'value': '654'}, {'id': 5, 'name': 'asda', 'value': '111'}]
其他删除更新等操作可以通过写sql然后放到text中去执行,也可以使用SQLAlchemy支持的相关方法,可以访问官网介绍进行更多学习
下面用SQLAlchemy结合pandas的read_sql_query方法修改成异步方式的实践作为结尾
异步SQLAlchemy引擎实现异步的pandas read_sql_query方法
# -*- coding:utf-8 -*-
from __future__ import annotations
import asyncio
import logging
import pandas as pd
from pandas._typing import DtypeArg
from pandas.io.sql import _convert_params, _wrap_result
from sqlalchemy import text
from sqlalchemy.engine import URL
from sqlalchemy.ext.asyncio import AsyncEngine
from sqlalchemy.ext.asyncio import create_async_engine
# Event loop used by the __main__ demo below.
# NOTE(review): asyncio.get_event_loop() without a running loop is deprecated
# since Python 3.10 — consider asyncio.new_event_loop() or asyncio.run().
loop_ = asyncio.get_event_loop()
class AsyncEngineSQLDatabase:
    """Async counterpart of pandas' internal SQLDatabase query reader.

    Wraps an SQLAlchemy AsyncEngine and reuses pandas' private helpers
    (_convert_params / _wrap_result) to turn query results into DataFrames.
    NOTE(review): depends on pandas private API (pandas.io.sql), which may
    change between pandas versions — pin pandas or vendor these helpers.
    """

    def __init__(self, engine):
        # NOTE(review): attribute is named `conn` but actually stores the
        # engine; execute() checks out a fresh pooled connection per call.
        self.conn: AsyncEngine = engine
        # Last formatted progress string produced by _query_iterator.
        self.progress = None

    async def execute(self, *args, **kwargs):
        """Execute a statement on a pooled connection and return its result.

        The connection is committed and released before returning.
        NOTE(review): the chunked path fetches from the result after this
        context exits, which presumably relies on the result being buffered
        client-side — confirm for the driver in use.
        """
        async with self.conn.connect() as conn:
            result = await conn.execute(*args, **kwargs)
            await conn.commit()
            return result

    async def async_read_query(
        self,
        sql,
        index_col=None,
        coerce_float: bool = True,
        params=None,
        parse_dates=None,
        chunksize: int | None = None,
        dtype: DtypeArg | None = None,
    ):
        """Async analogue of pandas.io.sql.SQLDatabase.read_query.

        Returns a DataFrame, or — when *chunksize* is given — an async
        generator yielding DataFrame chunks.
        """
        args = _convert_params(sql, params)
        result = await self.execute(*args)
        columns = result.keys()
        if chunksize is not None:
            # Not awaited: calling an async generator function only builds
            # the generator; the caller drives it with `async for`.
            return self._query_iterator(
                result,
                chunksize,
                columns,
                index_col=index_col,
                coerce_float=coerce_float,
                parse_dates=parse_dates,
                dtype=dtype,
            )
        else:
            data = result.fetchall()
            frame = _wrap_result(
                data,
                columns,
                index_col=index_col,
                coerce_float=coerce_float,
                parse_dates=parse_dates,
                dtype=dtype,
            )
            return frame

    # Alias mirroring pandas' read_sql entry point.
    async_read_sql = async_read_query

    async def _query_iterator(
        self,
        result,
        chunksize: int,
        columns,
        index_col=None,
        coerce_float=True,
        parse_dates=None,
        dtype: DtypeArg | None = None,
    ):
        """Return generator through chunked result set"""
        has_read_data = False
        total_size = result.rowcount
        read_size = 0
        while True:
            # local reads finish too fast; a sleep here helps when debugging
            data = result.fetchmany(chunksize)
            read_size += len(data)
            progress = f"{round(read_size / total_size, 4): .2%}"
            self.progress = progress
            if not data:
                if not has_read_data:
                    # Yield one empty frame so callers always receive a result.
                    yield _wrap_result(
                        [],
                        columns,
                        index_col=index_col,
                        coerce_float=coerce_float,
                        parse_dates=parse_dates,
                    )
                break
            else:
                has_read_data = True
                yield _wrap_result(
                    data,
                    columns,
                    index_col=index_col,
                    coerce_float=coerce_float,
                    parse_dates=parse_dates,
                    dtype=dtype,
                )
class AsyncMysqlEngine:
    """High-level async MySQL helper: raw SQL execution plus a pandas-style
    async read_sql_query built on AsyncEngineSQLDatabase."""

    def __init__(self, **kwargs):
        # Connection settings are overridable via keyword arguments; the
        # defaults reproduce the previous hard-coded local MySQL setup, so
        # AsyncMysqlEngine() behaves exactly as before (the original
        # accepted **kwargs but silently ignored them).
        self.engine: AsyncEngine = create_async_engine(
            URL.create(
                kwargs.get("drivername", "mysql+aiomysql"),
                username=kwargs.get("username", "root"),
                password=kwargs.get("password", "root"),
                host=kwargs.get("host", "localhost"),
                port=kwargs.get("port", 3306),
                database=kwargs.get("database", "services"),
            )
        )

    async def execute(self, sql, bind_params=None, is_fetchone=False, fetchmany=None):
        """Execute *sql* and return ``(rowcount, data)``.

        Parameters
        ----------
        sql : str, or an object exposing ``get_sql()`` (e.g. a query builder)
        bind_params : dict, optional — values for ``:name`` placeholders
        is_fetchone : bool — return a single row dict (or None) instead of a list
        fetchmany : unused; kept for interface compatibility

        Raises
        ------
        Exception — logged, rolled back, then re-raised.
        """
        if hasattr(sql, "get_sql"):
            # Query-builder objects render themselves via get_sql().
            sql = sql.get_sql()
        bind_params = bind_params or {}
        async with self.engine.connect() as conn:
            try:
                sql = text(sql).bindparams(**bind_params)
                result = await conn.execute(sql)
                rowcount = result.rowcount
                if is_fetchone:
                    row = result.fetchone()
                    # Guard: dict() of a missing row would raise; return None
                    # for an empty result set instead of crashing.
                    data = dict(row._mapping) if row is not None else None
                else:
                    # Rows are not directly dict()-able in SQLAlchemy 2.0;
                    # ._mapping works on both 1.4 and 2.0.
                    data = [dict(row._mapping) for row in result.fetchall()]
                await conn.commit()  # required for writes; harmless for SELECT
                return rowcount, data
            except Exception as e:
                logging.error(
                    f"SQL execution met an unexpected error, "
                    f"hint:\n {e}\n, "
                    f"the error query sql is: \n{sql}\n"
                )
                await conn.rollback()
                # bare `raise` preserves the original traceback
                raise

    async def async_read_query(
        self,
        sql,
        index_col=None,
        coerce_float: bool = True,
        params=None,
        parse_dates=None,
        chunksize: int | None = None,
        dtype: DtypeArg | None = None,
    ):
        """Async analogue of pandas.read_sql_query.

        Returns a DataFrame. When *chunksize* is given, the chunks are
        consumed here (printing progress) and concatenated into one frame.
        """
        database = AsyncEngineSQLDatabase(self.engine)
        params = params or {}
        bound_sql = text(sql).bindparams(**params)
        frame_or_async_generator = await database.async_read_query(
            bound_sql,
            index_col,
            coerce_float,
            params,
            parse_dates,
            chunksize,
            dtype
        )
        if chunksize:
            # Seed with an empty frame so concat succeeds even with no chunks.
            frames = [pd.DataFrame()]
            async for frame_part in frame_or_async_generator:
                print(database.progress)
                frames.append(frame_part)
            return pd.concat(frames, ignore_index=True)
        return frame_or_async_generator
# Module-level engine instance shared by the demo coroutines below.
async_engine = AsyncMysqlEngine()
async def run_test2():
    """Read the whole table into a single DataFrame and print it."""
    frame = await async_engine.async_read_query("select * from test_table")
    print(frame)
async def run_test3():
    """Read in chunks of 3 rows, filtering with a bound :id parameter."""
    query = "select * from test_table where id > :id"
    frame = await async_engine.async_read_query(query, params={"id": 1}, chunksize=3)
    print(frame)
if __name__ == '__main__':
    # Swap the commented line in to try the non-chunked variant.
    # loop_.run_until_complete(run_test2())
    loop_.run_until_complete(run_test3())