pyodps实践:辨别分区表
【描述】在没有元数据的情况下,判断其他odps数据库中哪些表是分区表,下面的代码进行了一个实践,写的有些粗糙,但是能满足基本需求,希望收获批评以改进。
# -*- coding: utf-8 -*-
from odps import ODPS
import datetime
import sys
reload(sys)
sys.setdefaultencoding('utf8')
#分区字符
dt_str = (datetime.datetime.now()+datetime.timedelta(days=-1)).strftime('%Y%m%d')
def dt_check(aim_list):
#1.获取分区表list: dt_table
dt_table = []
not_dt_table = []
for table_name in aim_list:
t = o.get_table(table_name, project='data_center')
te = t.schema
for col in te:
col_name = col.name
if col_name == 'dt': # 判断分区字段dt
dt_table.append(table_name)
break
else:
not_dt_table.append(table_name)
continue
dis_not_dt_table = set(list(filter(lambda v: v not in dt_table,not_dt_table)))
dis_not_dt_table = list (dis_not_dt_table)
print('分区表如下')
print(dt_table)
print('非分区表如下')
print(dis_not_dt_table)
return dt_table,dis_not_dt_table
def dt_table_look(dt_table):
# 2 获取目标数据:分区表名 分区 分区统计值
numm = 0
for table_name in dt_table:
sql1 = 'SELECT dt,count(*) from data_center.{} group by dt order by dt desc limit 3;'.format(table_name)
table_lis = []
with odps.execute_sql(sql1).open_reader() as reader:
for row in reader:
tmp_row = (table_name,row[0],row[1])
table_lis.append(tmp_row)
# 打印每行数据
print(tmp_row)
numm += 1
print("where dt=(select MAX(dt)from data_center.{})".format(table_name))
print("读取第{}个目标数据完成!".format(numm))
# aim_list 需要改变
aim_list = [
'ds_report_s_d',
'ds_prod_df',
'ds_cata_df',
'ds_valid_old',
'ds_law_old',
'ds_hzkb_old'
]
# 不需要改变
dt_table,dis_not_dt_table = dt_check(aim_list)
dt_table_look(dt_table)