1.问题背景描述
python项目中的时序数据都存放在TD数据库中,数据是秒级存入的,当查询一周数据时将超过50w数据量,此时一次性获取全量数据到python程序很慢,全流程10秒以上,希望进行优化加速
2.排查
首先,分步排查从td取超过7秒,在python程序中处理格式超过3秒;
其次,业务逻辑处理步骤中,时间大量消耗的逻辑是 时间对象转成字符串;
再次,td取数步骤中,时间大量消耗的逻辑是 获取到数据后时间戳转为时间对象
最后,思路确定为,从td获取的ts字段直接按bigint返回,交由业务逻辑处理,直接从bigint转成字符串;
3.查源码尝试修改
TD取的ts字段是用bigint存的(C_TIMESTAMP类型),排查源码发现公共包中会将C_TIMESTAMP类型的字段都转成datetime对象返回,而转化方法_convert_millisecond_to_datetime就是慢的根源;继续查相关源码,若想用bigint返回,发现CONVERT_FUNC_BLOCK这个函数工厂,key是每个字段类型fields[i]["type"],这个字段类型是在taos_fetch_fields中获取并缓存到cursor._fields上的
# TDHelper().db_query(sql) 调用入口
class TDHelper:  # custom TDengine (TD) adapter
    """Thin query helper around a TD cursor.

    Entry point: ``TDHelper().db_query(sql)``.
    """

    def db_query(self, sql, return_timestamp=False):
        """Run *sql* and return all rows.

        Args:
            sql: the query to execute.
            return_timestamp: when True, the first column (assumed to be a
                datetime) is converted to a POSIX timestamp float.

        Returns:
            List of row tuples; ``[]`` when the queried table does not exist.
        """
        with self.cursor() as c:
            return self._query_handler(c, sql, return_timestamp)

    def _query_handler(self, cursor, sql, return_timestamp=False):
        """Execute *sql* on *cursor* and post-process the result rows."""
        try:
            # execute() populates cursor._fields, which determines how each
            # column is deserialized later in fetchall().
            cursor.execute(sql)
            # cursor._fields[0]._type = 5  # patch point, kept for testing
            result = cursor.fetchall()
            if not return_timestamp:
                return result
            # First column is a datetime object; replace it with its POSIX
            # timestamp, keeping the remaining columns untouched.
            return [(row[0].timestamp(), *row[1:]) for row in result]
        except ProgrammingError as e:
            if e.msg == 'Fail to get table info, error: Table does not exist':
                # Log only the sql; the exception text adds nothing here.
                logger.warning('Table does not exist, sql [{}]'.format(sql))
                return []
            # Bare raise preserves the original traceback unchanged.
            raise
# 如下都是TD公共包中的源码
def fetchall(self):  # method on the TD cursor (vendor source, quoted)
    """Fetch every remaining row of the current result set.

    Pulls data block-by-block from the C library, converting each column
    to Python values, then transposes the column buffers into row tuples.
    """
    if self._result is None:
        raise OperationalError("Invalid use of fetchall")
    # Prefer the fields cached by execute(); fall back to asking the C lib.
    fields = self._fields if self._fields is not None else taos_fetch_fields(
        self._result)
    # One accumulator list per column.
    buffer = [[] for i in range(len(fields))]
    self._rowcount = 0
    while True:
        # Key step: block fetch plus per-column type conversion.
        block, num_of_rows = taos_fetch_block(self._result, self._fields)
        errno = taos_errno(self._result)
        if errno != 0:
            raise ProgrammingError(taos_errstr(self._result), errno)
        if num_of_rows == 0:
            break
        self._rowcount += num_of_rows
        # NOTE(review): this iterates self._fields, not the local `fields`
        # computed above — presumably they always match; confirm upstream.
        for i in range(len(self._fields)):
            buffer[i].extend(block[i])
    # Transpose: list of column lists -> list of row tuples.
    return list(map(tuple, zip(*buffer)))
def taos_fetch_block(result, fields=None, field_count=None):
    """Fetch the next block of rows and convert each column to Python values.

    Returns:
        (blocks, row_count) where ``blocks`` is a per-column list of
        converted values, or ``(None, 0)`` when the result is exhausted.
    """
    if fields is None:
        fields = taos_fetch_fields(result)
    if field_count is None:
        field_count = taos_field_count(result)
    pblock = ctypes.c_void_p(0)
    num_of_rows = _libtaos.taos_fetch_block(result, ctypes.byref(pblock))
    if num_of_rows == 0:
        return None, 0
    precision = taos_result_precision(result)
    blocks = [None] * field_count
    for i in range(len(fields)):
        # Raw pointer to column i's data within the fetched block.
        data = ctypes.cast(pblock, ctypes.POINTER(ctypes.c_void_p))[i]
        if fields[i]["type"] not in CONVERT_FUNC_BLOCK_v3 and fields[i]["type"] not in CONVERT_FUNC_BLOCK:
            raise DatabaseError("Invalid data type returned from database")
        offsets = []
        is_null = []
        if fields[i]["type"] in (FieldType.C_VARCHAR, FieldType.C_NCHAR, FieldType.C_JSON):
            # Variable-length types need per-row data offsets.
            offsets = taos_get_column_data_offset(result, i, num_of_rows)
            blocks[i] = CONVERT_FUNC_BLOCK_v3[fields[i]["type"]](data, is_null, num_of_rows, offsets, precision)
        else:
            is_null = [taos_is_null(result, j, i) for j in range(num_of_rows)]
            # Key step: the converter is selected purely by the field type,
            # so rewriting the type to C_BIGINT skips datetime conversion.
            blocks[i] = CONVERT_FUNC_BLOCK[fields[i]["type"]](data, is_null, num_of_rows, offsets, precision)
    return blocks, abs(num_of_rows)
# Function factory: maps a TD field type code to its block-level converter.
CONVERT_FUNC_BLOCK = {
    FieldType.C_BOOL: _crow_bool_to_python,
    FieldType.C_TINYINT: _crow_tinyint_to_python,
    FieldType.C_SMALLINT: _crow_smallint_to_python,
    FieldType.C_INT: _crow_int_to_python,
    FieldType.C_BIGINT: _crow_bigint_to_python,
    FieldType.C_FLOAT: _crow_float_to_python,
    FieldType.C_DOUBLE: _crow_double_to_python,
    FieldType.C_BINARY: _crow_binary_to_python_block,
    FieldType.C_TIMESTAMP: _crow_timestamp_to_python,  # key entry: the slow datetime path
    FieldType.C_NCHAR: _crow_nchar_to_python_block,
    FieldType.C_TINYINT_UNSIGNED: _crow_tinyint_unsigned_to_python,
    FieldType.C_SMALLINT_UNSIGNED: _crow_smallint_unsigned_to_python,
    FieldType.C_INT_UNSIGNED: _crow_int_unsigned_to_python,
    FieldType.C_BIGINT_UNSIGNED: _crow_bigint_unsigned_to_python,
    FieldType.C_JSON: _crow_nchar_to_python_block,
}
def _crow_timestamp_to_python(data, is_null, num_of_rows, nbytes=None, precision=FieldType.C_TIMESTAMP_UNKNOWN):
    """Convert a C timestamp column block to a list of Python datetimes.

    Picks a converter by the result's time precision, then converts every
    int64 value in the block (None for null rows).
    """
    # Key step: every value goes through one of the datetime converters.
    _timestamp_converter = _convert_millisecond_to_datetime
    if precision == FieldType.C_TIMESTAMP_MILLI:
        _timestamp_converter = _convert_millisecond_to_datetime
    elif precision == FieldType.C_TIMESTAMP_MICRO:
        _timestamp_converter = _convert_microsecond_to_datetime
    elif precision == FieldType.C_TIMESTAMP_NANO:
        _timestamp_converter = _convert_nanosecond_to_datetime
    else:
        raise DatabaseError("Unknown precision returned from database")
    return [
        None if is_null[i] else _timestamp_converter(ele)
        for i, ele in enumerate(ctypes.cast(data, ctypes.POINTER(ctypes.c_int64))[: abs(num_of_rows)])
    ]
def _convert_millisecond_to_datetime(milli):
    """Convert an epoch-millisecond int to a (possibly tz-aware) datetime."""
    try:
        if _priv_tz is None:
            return _datetime_epoch + timedelta(seconds=milli / 1000.0)
        # Root of the slowdown: a timedelta construction plus astimezone()
        # call for every single value.
        return (_utc_datetime_epoch + timedelta(seconds=milli / 1000.0)).astimezone(_priv_tz)
    except OverflowError:
        # Catch OverflowError and fall through — implicitly returns None.
        print("WARN: datetime overflow!")
        pass
4. 最终修改和效果
修改ts字段的类型从C_TIMESTAMP改为C_BIGINT,相关逻辑如下,
参数说明:cursor 即TDHelper().cursor()获得,sql = 'select ts,val from table_1 ORDER BY ts desc limit 1'
最后效果,50w数据从10s优化到3s
def test(cursor, sql):  # fixed: the original was missing the ':' after the signature
    """Fetch rows with the ts column returned as a raw bigint.

    Args:
        cursor: cursor obtained from ``TDHelper().cursor()``.
        sql: query whose first column is ts, e.g.
             'select ts,val from table_1 ORDER BY ts desc limit 1'

    Returns:
        List of row tuples with ts as an int; ``[]`` when the table is missing.
    """
    try:
        cursor.execute(sql)
        # Rewrite the ts column's cached type so fetchall() dispatches to the
        # bigint converter and skips the expensive datetime conversion.
        if cursor._fields[0]._type == FieldType.C_TIMESTAMP:
            cursor._fields[0]._type = FieldType.C_BIGINT
        return cursor.fetchall()
    except ProgrammingError as e:
        if e.msg == 'Fail to get table info, error: Table does not exist':
            # Log only the sql; the exception text adds nothing here.
            logger.warning('Table does not exist, sql [{}]'.format(sql))
            return []
        # Bare raise preserves the original traceback unchanged.
        raise