文章目录
关于hive中Statistics特性请查看Statistics in Hive
1. 相关元数据表介绍
DBS
表用于存储hive数据库信息, 包含数据库名、文件系统路径等。
通过DB_ID
字段与其他表相关联。
TBLS
表用于存储Hive表、视图、索引表的基本信息,包含表名、表类型(外部表等)、所属数据库id、创建时间等信息。
该表中有全部hive表信息
TABLE_PARAMS
表存储全部hive表的一些额外信息(表属性以及其他统计信息)。该表只有3个字段列:TBL_ID
、PARAM_KEY
和PARAM_VALUE
,表的信息使用key的方式进行储存。
通过TBL_ID
与TBLS
表关联。
该表中有全部hive表信息,但是只存储非分区表的统计信息。
numFiles:数据集包含的文件数
numRows:数据集包含的行数
rawDataSize:数据集原始数据大小
totalSize:文件系统级别看的数据集大小,如果使用hdfs存储数据文件,则该值为hdfs上实际的字节数
PARTITIONS
表存储分区表的分区信息,包含分区名称、所属表id、创建时间等信息。
通过TBL_ID与TBLS
表关联。
只包含分区表的相关信息。
PARTITION_PARAMS
表存储分区表的一些额外分区信息。该表只有3个字段PART_ID
、PARAM_KEY
和PARAM_VALUE
,分区信息使用key的方式进行存储。类似于TABLE_PARAMS
表。
通过PART_ID与PARTITIONS
表关联。
存储分区表的统计信息,统计信息的key同
TABLE_PARAMS
表介绍。
2. 示例
查看非分区表的磁盘占用,最终结果以数据库进行汇总。
SELECT tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
,SUM(totalSize) AS totalSize
FROM
(
SELECT b.`NAME` AS dbName -- 数据库名称
,a.TBL_ID -- 表id
,a.TBL_NAME -- 表名
,d.PARAM_VALUE AS numFiles -- 表下文件数
-- ,e.PARAM_VALUE AS numRows -- 表下记录数
-- ,f.PARAM_VALUE AS rawDataSize -- 表原始数据大小
,g.PARAM_VALUE AS totalSize -- 占用hdfs空间大小
FROM TBLS AS a -- 存储Hive表、视图、索引表的基本信息
LEFT JOIN DBS AS b -- 存储Hive中所有数据库的基本信息
ON a.DB_ID = b.DB_ID
LEFT JOIN PARTITIONS AS c -- 存储表分区信息
ON a.TBL_ID = c.TBL_ID
LEFT JOIN TABLE_PARAMS AS d -- 存储表的一些额外信息,如果是非分区表还有该表对应的HDFS文件数量和大小
ON a.TBL_ID = d.TBL_ID
-- LEFT JOIN TABLE_PARAMS AS e
-- ON a.TBL_ID = e.TBL_ID
-- LEFT JOIN TABLE_PARAMS AS f
-- ON a.TBL_ID = f.TBL_ID
LEFT JOIN TABLE_PARAMS AS g
ON a.TBL_ID = g.TBL_ID
WHERE c.PART_ID IS null -- 排除分区表
AND d.PARAM_KEY = "numFiles"
-- AND e.PARAM_KEY = "numRows"
-- AND f.PARAM_KEY = "rawDataSize"
AND g.PARAM_KEY = "totalSize"
ORDER BY b.NAME asc, a.TBL_ID ASC
) AS tmp
GROUP BY tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
查看分区表的磁盘占用,最终结果以数据库进行汇总。
SELECT tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
,SUM(totalSize) AS totalSize
FROM
(
SELECT b.NAME AS dbName
,a.TBL_ID
,a.TBL_NAME
,c.PART_ID
,c.PART_NAME
,d.PARAM_VALUE AS numFiles -- 分区下文件数
-- ,e.PARAM_VALUE AS numRows -- 分区下记录数
-- ,f.PARAM_VALUE AS rawDataSize -- 分区下原始数据大小
,g.PARAM_VALUE AS totalSize -- 占用hdfs空间大小
FROM TBLS AS a
JOIN DBS AS b
ON a.DB_ID = b.DB_ID
LEFT JOIN PARTITIONS AS c -- 存储表分区信息
ON a.TBL_ID = c.TBL_ID
LEFT JOIN PARTITION_PARAMS AS d -- 分区表的额外信息
ON c.PART_ID = d.PART_ID
-- LEFT JOIN PARTITION_PARAMS AS e
-- ON c.PART_ID = e.PART_ID
-- LEFT JOIN PARTITION_PARAMS AS f
-- ON c.PART_ID = f.PART_ID
LEFT JOIN PARTITION_PARAMS AS g
ON c.PART_ID = g.PART_ID
WHERE c.PART_ID IS NOT null
AND d.PARAM_KEY = "numFiles"
-- AND e.PARAM_KEY = "numRows"
-- AND f.PARAM_KEY = "rawDataSize"
AND g.PARAM_KEY = "totalSize"
ORDER BY b.NAME ASC, c.PART_ID ASC
) AS tmp
GROUP BY tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
查看全部表的磁盘占用,上述两个示例的汇总。
SELECT tmp.dbName
,FORMAT(SUM(totalSize)/1024/1024/1024,2) AS totalSize_gb
FROM
(
SELECT tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
,SUM(totalSize) AS totalSize
FROM
(
SELECT b.NAME AS dbName
,a.TBL_ID
,a.TBL_NAME
,c.PART_ID
,c.PART_NAME
,d.PARAM_VALUE AS numFiles -- 分区下文件数
,g.PARAM_VALUE AS totalSize -- 占用hdfs空间大小
FROM TBLS AS a
JOIN DBS AS b
ON a.DB_ID = b.DB_ID
LEFT JOIN PARTITIONS AS c -- 存储表分区信息
ON a.TBL_ID = c.TBL_ID
LEFT JOIN PARTITION_PARAMS AS d -- 分区表的额外信息
ON c.PART_ID = d.PART_ID
LEFT JOIN PARTITION_PARAMS AS g
ON c.PART_ID = g.PART_ID
WHERE c.PART_ID IS NOT null -- 分区表信息
AND d.PARAM_KEY = "numFiles"
AND g.PARAM_KEY = "totalSize"
ORDER BY b.NAME ASC, c.PART_ID ASC
) AS tmp
GROUP BY tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
UNION ALL
SELECT tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
,SUM(totalSize) AS totalSize
FROM
(
SELECT b.`NAME` AS dbName -- 数据库名称
,a.TBL_ID -- 表id
,a.TBL_NAME -- 表名
,d.PARAM_VALUE AS numFiles -- 表下文件数
,g.PARAM_VALUE AS totalSize -- 占用hdfs空间大小
FROM TBLS AS a -- 存储Hive表、视图、索引表的基本信息
LEFT JOIN DBS AS b -- 存储Hive中所有数据库的基本信息
ON a.DB_ID = b.DB_ID
LEFT JOIN PARTITIONS AS c -- 存储表分区信息
ON a.TBL_ID = c.TBL_ID
LEFT JOIN TABLE_PARAMS AS d -- 存储表的一些额外信息,如果是非分区表还有该表对应的HDFS文件数量和大小
ON a.TBL_ID = d.TBL_ID
LEFT JOIN TABLE_PARAMS AS g
ON a.TBL_ID = g.TBL_ID
WHERE c.PART_ID IS null -- 非分区表信息
AND d.PARAM_KEY = "numFiles"
AND g.PARAM_KEY = "totalSize"
ORDER BY b.NAME asc, a.TBL_ID ASC
) AS tmp
GROUP BY tmp.dbName
,tmp.TBL_ID
,tmp.TBL_NAME
) AS tmp
GROUP BY tmp.dbName