说明
用途有两个:
- 1 增量更新自己的数据
- 2 当读取的表比较大时分开来读
本例场景限制:只有读表权限
1 内容
1.1 一般情况
通常来说,表里的数据会有行号,但有时也不一定。如果有写表权限的话,那么就自己增加一个行号,便于小批量的读取。
- 在查询中临时生成(但有时候表太大也不行,中间的临时表会太占内存)
select * from
(select * , (@rownum:= @rownum + 1) as rownum from some_table ) as t
where t.rownum > 100
limit 100;
- 直接修改表,增加字段(因为本次没权限,我就没跑)
# 修改表结构
ALTER TABLE t_ds_court ADD row_number int
SET @row = 0
UPDATE some_table SET row_number = (@row: =@row+1)
1.2 根据键值/分类值
问题是表太大,所以需要读取时分块执行
全表很大,但是键值通常并不大
import time
import pymysql
# 获取某一列的数据,加筛选条件
def exe_sql_with_cursor(the_sql, cfg_mysql):
connection = pymysql.connect(**cfg_mysql)
start = time.time() * 1000
try:
with connection.cursor() as cursor:
cursor.execute(the_sql)
connection.commit()
res = cursor.fetchall()
except:
print('[E] SQL Execution Error ')
res = None
finally:
connection.close()
end = time.time() * 1000
print('[I] SQL Done Within %.2f ms' % (end-start))
return res
the_sql = 'select id from some_table'
key_list = exe_sql_with_cursor(the_sql, cfg_mysql)
# 这带宽...
[I] SQL Done Within 298283.59 ms
In [13]: key_list[:3]
Out[13]: ((753117,), (753118,), (753119,))
将列表扁平化后看看是否是等差数列
key_list_ravel = [x[0] for x in key_list]
key_list_ravel.sort()
import numpy as np
# 判断一个一维列表是否为(公差为1的)等差数列
def is_diff1_num(some_list):
some_arr = np.array(some_list)
some_arr.astype(float)
some_arr1 = np.diff(some_arr,1)
diff_nums = np.unique(some_arr1)
if len(diff_nums) == 1:
status = True
else:
status = False
return status, diff_nums[0]
is_diff1, diffnum = is_diff1_num(key_list_ravel[:100])
结果果然不是。然后按照百分位取数(以前用pandas也做过,以后打算慢慢迁移到使用numpy上)
# 获取有效分位数(浮点数) ,最大最小使用实际的数(32位范围)而不是np.inf
def get_list_ptiles(some_list, pct_list = None, cuts = None , min_val=-1e300, max_val=1e300):
if pct_list is None:
# linspace 会生成包含两端的列表 [0. , 0.11111111, 0.22222222, 0.33333333, 0.44444444, 1.]
# 注意要转为0~100的值,不是小数
pct_list = np.linspace(0,1, cuts) *100
# 获取实际分位数后去重
cut_nums = np.unique(np.percentile(some_list, pct_list))
# 为首尾增加最大最小
cut_nums[0] = min_val
cut_nums[-1] = max_val
return list(cut_nums)
把之前的函数稍作修改,以便支持按条件读取
import pymysql as pyl
import pandas as pd
# 按条件读取数据并转为表
def mysql_read_table_where(table_name,where, cfg_mysql=None):
conn = pyl.connect(**cfg_mysql)
try:
with conn.cursor() as cursor:
tem_sql = 'select * from %s ' % table_name + ' ' + where
cursor.execute(tem_sql)
# 增加
col_dis = cursor.description
col = []
for x in list(col_dis):
col.append(x[0])
# print(col)
result = cursor.fetchall()
df = pd.DataFrame(list(result))
df.columns = col
conn.commit()
res = df
except:
res = False
finally:
conn.close()
return res
最后加上tqdm完成整个工作
import tqdm
# 分批次循环的读取mysql里的表格, 使用tqdm显示。需要一个数值型的变量和一个数值型列表(序号列表)。
def read_mysql_table_by_iter(table_name, idx_list ,cfg_mysql, where_template='where id >= %s and id <%s' ):
res_df_list = []
for idx in tqdm.tqdm(range(1, len(idx_list)-1)):
where_condition = where_template % (idx_list[idx], idx_list[idx +1])
tem_df = mysql_read_table_where(table_name, where_condition, cfg_mysql=cfg_mysql)
res_df_list.append(tem_df)
return pd.concat(res_df_list, ignore_index=True)
key_split_index_list = get_list_ptiles(key_list_ravel, cuts=1000)
res_df = read_mysql_table_by_iter('some_table', key_split_index_list, cfg_mysql)