WeChat official account: pythonislover
Python Big Data and SQL Optimization Notes: Using Python to Read HDFS and Calculate Hive Database Sizes (mp.weixin.qq.com)
Today I'm sharing a feature used in a real project: how to use Python to read HDFS and calculate the size of Hive databases. No theory today, just code.
from hdfs import InsecureClient  # pip install hdfs
import time

# hdfsConfig is the project's own config object; it exposes the WebHDFS
# hostname, port, and username used below.

def read_db_size_from_hive():
    # Hive keeps managed (internal) and external tables under separate
    # warehouse roots; a database's total size is the sum of its
    # directories under both.
    hdfs_location_dir = ['/warehouse/tablespace/managed/hive/',
                         '/warehouse/tablespace/external/hive/']
    try:
        hostname = hdfsConfig.hostname
        port = hdfsConfig.port
        username = hdfsConfig.username
        hdfs_url = "http://{}:{}".format(hostname, port)
        client = InsecureClient(hdfs_url, user=username)
        # Database directories are named "<db_name>.db"
        internal_db_list = [db_name for db_name in client.list(hdfs_location_dir[0]) if db_name.endswith('.db')]
        external_db_list = [db_name for db_name in client.list(hdfs_location_dir[1]) if db_name.endswith('.db')]
        db_size_list = []
        # Databases present under both roots: size = managed part + external part
        intersection_db_list = list(set(internal_db_list).intersection(set(external_db_list)))
        for db_name in intersection_db_list:
            db_file_status_internal = client.content(hdfs_location_dir[0] + db_name)
            db_file_status_external = client.content(hdfs_location_dir[1] + db_name)
            db_file_owner = client.status(hdfs_location_dir[0] + db_name)
            # Report an empty directory as 1 byte so a size of 0 never appears
            db_folder_size_internal = db_file_status_internal.get('spaceConsumed') or 1
            db_folder_size_external = db_file_status_external.get('spaceConsumed') or 1
            db_total_size = db_folder_size_internal + db_folder_size_external
            db_name = db_name.replace('.db', '')
            db_size_list.append((db_name, db_file_owner.get('owner', 'hive'), db_total_size,
                                 'Hive', time.strftime("%Y-%m-%d %X"), 'NULL'))
        # Databases present under only one root: that directory alone is the
        # size. Handle both one-sided differences, external-only and
        # internal-only, instead of picking one by comparing list lengths.
        for location, difference_db_list in (
                (hdfs_location_dir[1], set(external_db_list).difference(internal_db_list)),
                (hdfs_location_dir[0], set(internal_db_list).difference(external_db_list))):
            for db_name in difference_db_list:
                db_file_status = client.content(location + db_name)
                db_file_owner = client.status(location + db_name)
                db_total_size = db_file_status.get('spaceConsumed') or 1
                db_name = db_name.replace('.db', '')
                db_size_list.append((db_name, db_file_owner.get('owner', 'hive'), db_total_size,
                                     'Hive', time.strftime("%Y-%m-%d %X"), 'NULL'))
        return db_size_list
    except Exception as e:
        print('ERROR (read_db_size_from_hive): ' + str(e))
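Before the walkthrough, here is a minimal call site for reference; the printed row is illustrative, not output from a real cluster:

if __name__ == '__main__':
    # The function returns None on error, so fall back to an empty list
    rows = read_db_size_from_hive() or []
    # Each row: (db_name, owner, size_in_bytes, db_type, collect_time, remark)
    for row in rows:
        print(row)
    # e.g. ('sales_dw', 'hive', 73400320, 'Hive', '2020-05-11 10:30:00', 'NULL')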
A quick walkthrough of the flow: a Hive database exists in HDFS as a directory, so calculating the size of a Hive database really means calculating the size of an HDFS directory.
Each Hive database is split across two paths, one holding managed (internal) tables and one holding external tables, so only the sum of both parts gives the full database size, as the sketch below shows.
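As a minimal sketch of that idea, here is how you could size a single database by hand, assuming client is the InsecureClient built above and 'demo.db' is a hypothetical database directory that exists under both roots:

# Hand-rolled size check for one database
managed_root = '/warehouse/tablespace/managed/hive/'
external_root = '/warehouse/tablespace/external/hive/'
managed_bytes = client.content(managed_root + 'demo.db')['spaceConsumed']
external_bytes = client.content(external_root + 'demo.db')['spaceConsumed']
print('demo occupies {} bytes across both roots'.format(managed_bytes + external_bytes))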
We import the hdfs package to work with files on HDFS. The key points here are the client.status and client.content methods; if you have an environment available, it's worth testing their return values yourself, roughly as sketched below.
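If you don't have a cluster handy: both methods are thin wrappers over WebHDFS (GETCONTENTSUMMARY and GETFILESTATUS respectively), and they return dicts shaped roughly like this; the values are illustrative:

# client.content('/warehouse/tablespace/managed/hive/demo.db')
# {'directoryCount': 12, 'fileCount': 340, 'length': 1073741824,
#  'quota': -1, 'spaceConsumed': 3221225472, 'spaceQuota': -1}
# spaceConsumed = length * replication factor, hence the 3x here.

# client.status('/warehouse/tablespace/managed/hive/demo.db')
# {'owner': 'hive', 'group': 'hadoop', 'type': 'DIRECTORY',
#  'permission': '755', 'modificationTime': 1588300800000, ...}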
If anything is unclear, or you have other suggestions, feel free to leave a message in the backend.
This example is not the complete code; the full version collects the sizes of PostgreSQL, Hive, Greenplum, and other databases, and finally stores the results in MySQL (sketched below). If you're interested, leave me a message.
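For that last step, here is a minimal sketch of the MySQL side, assuming a table named db_size with columns matching the tuples above; the table name, schema, and connection settings are placeholders of mine, not from the original project:

import pymysql  # pip install pymysql

def save_db_size(db_size_list):
    # Placeholder connection settings; substitute your own
    conn = pymysql.connect(host='127.0.0.1', user='root', password='***',
                           database='monitor', charset='utf8mb4')
    try:
        with conn.cursor() as cur:
            # One row per tuple from read_db_size_from_hive()
            cur.executemany(
                'INSERT INTO db_size '
                '(db_name, owner, size_bytes, db_type, collect_time, remark) '
                'VALUES (%s, %s, %s, %s, %s, %s)',
                db_size_list)
        conn.commit()
    finally:
        conn.close()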