hive脚本查看表更新时间
Python hive查看表更新时间
from impala.dbapi import connect
from pandasql import sqldf
pysqldf = lambda x: sqldf(x, globals())
from sqlalchemy import create_engine
import pandas as pd
import arrow,time
from impala.util import as_pandas
def search_big_data_df(sql):
# conn = connect(host='192.168.249.212', port=10000, user='yinboya',auth_mechanism='PLAIN')
conn = connect(host='nn1.hadoop.com', port=10000, user='hueadmin',auth_mechanism='PLAIN',timeout=False)
cur = conn.cursor()
df = pd.DataFrame()
try:
for i in sql.split(';'):
cur.execute(i)
df = as_pandas(cur)
except Exception as e:
import traceback
print(traceback.format_exc())
conn.close()
return df
sql = """
use ods;
show tables
"""
dftables = search_big_data_df(sql)# 获取ods库所有表
#循环扫描所有表更新时间
for t in dftables.to_dict('r'):
sql = """
dfs -ls /user/hive/warehouse/ods.db/{0}
""".format(t['tab_name'])
# print(sql)
df = search_big_data_df(sql)
df = df.iloc[1:,:]
df['DFS Output']=df['DFS Output'].apply(lambda x:" ".join(x.split(' ')[-3:-1]))
df.sort_values('DFS Output',inplace=True)
# 拿取最新块更新时间
time_str = list(df['DFS Output'])[-1]
# 时间比对 小于三天时间打印出来
if arrow.get(time_str)<arrow.now().shift(days=-3):
print(t['tab_name'],time_str)