以下操作基于hadoop3.1.2、hive3.1.2,其他版本字段有差异
1、通过hdfs命令导出fsimage并转为tab分隔的文本文件(Delimited格式,字段以\t分隔,并非真正的csv)
# Fetch the latest fsimage from the active NameNode into local file "myfile".
hdfs dfsadmin -fetchImage myfile
# Convert the binary fsimage into a delimited text dump ("Delimited" processor;
# despite the .csv name the fields are tab-separated, matching the '\t'
# terminator used by the external table DDL below).
hdfs oiv -i myfile -o fsimage.csv -p Delimited
2、到hive建外表及统计
-- External table over the tab-delimited fsimage dump written by
-- `hdfs oiv -p Delimited` (one inode per line).
-- All columns are declared as string because the dump is plain text;
-- downstream queries cast the numeric fields explicitly or implicitly.
CREATE EXTERNAL TABLE IF NOT EXISTS default.hdfs_info(
    path              string,
    replication       string,
    modificationtime  string,
    accesstime        string,
    preferredblocksize string,
    blockscount       string,
    filesize          string,
    nsquota           string,
    dsquota           string,
    permission        string,
    username          string,
    groupname         string)
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
LOCATION 'hdfs://qingfeng_cluster:8020/tmp/prod_hdfs'
-- The Delimited processor emits a header row ("Path\tReplication\t...");
-- skip it so the header line is not parsed as a data row.
TBLPROPERTIES ('skip.header.line.count'='1');
统计一级目录大小
-- Total size in GiB per first-level directory, largest first.
-- Flattened from a pointless double-nested form whose inner query also
-- selected columns (accesstime, username) that were never used.
SELECT
    concat('/', split(path, '/')[1]) AS joinedpath,
    round(sum(filesize) / 1024 / 1024 / 1024, 2) AS sumsize
FROM default.hdfs_info
GROUP BY concat('/', split(path, '/')[1])
ORDER BY sumsize DESC;
统计二级目录大小
-- Total size in GiB per second-level directory, largest first.
-- Flattened from a double-nested form with unused inner columns,
-- mirroring the first-level query above.
SELECT
    concat('/', split(path, '/')[1], '/', split(path, '/')[2]) AS joinedpath,
    round(sum(filesize) / 1024 / 1024 / 1024, 2) AS sumsize
FROM default.hdfs_info
GROUP BY concat('/', split(path, '/')[1], '/', split(path, '/')[2])
ORDER BY sumsize DESC;
三级目录下小于100k文件统计
-- Count of entries smaller than 100 KiB per third-level directory.
-- The original three-level nesting (CASE -> filter -> group) collapses to
-- a single WHERE + GROUP BY; filesize is stored as string, so cast
-- explicitly instead of relying on implicit string/int comparison.
-- NOTE(review): directory inodes also have FileSize 0 in the fsimage dump
-- and are counted here as "small"; add a filter on blockscount > 0 if only
-- real files should be counted — confirm against the dump contents.
SELECT
    concat('/', split(path, '/')[1], '/', split(path, '/')[2], '/', split(path, '/')[3]) AS path,
    count(*) AS small_file_num
FROM default.hdfs_info
WHERE cast(filesize AS bigint) < 100 * 1024
GROUP BY concat('/', split(path, '/')[1], '/', split(path, '/')[2], '/', split(path, '/')[3])
ORDER BY small_file_num DESC;
统计指定四级目录下各五级子目录的大小及最早访问/最近修改时间
-- Per fifth-level directory under /user/hive/warehouse/default.db:
-- earliest access time, latest modification time, and total size in GiB.
-- FIX: the inner query referenced a non-existent column `modificatetime`;
-- the table column is `modificationtime` (see the DDL) — the original
-- statement fails with an invalid-column error.
-- NOTE(review): dividing by 1000 assumes the time columns hold epoch
-- milliseconds; confirm against the actual Delimited dump format.
SELECT
    joinedpath,
    from_unixtime(ceil(accesstime / 1000), 'yyyy-MM-dd HH:mm:ss') AS accesstime,
    from_unixtime(ceil(modificationtime / 1000), 'yyyy-MM-dd HH:mm:ss') AS modificationtime,
    sumsize
FROM (
    SELECT
        joinedpath,
        min(accesstime) AS accesstime,
        max(modificationtime) AS modificationtime,
        round(sum(filesize) / 1024 / 1024 / 1024, 2) AS sumsize
    FROM (
        SELECT
            concat('/', split(path, '/')[1], '/', split(path, '/')[2], '/', split(path, '/')[3],
                   '/', split(path, '/')[4], '/', split(path, '/')[5]) AS joinedpath,
            accesstime,
            modificationtime,
            filesize
        FROM default.hdfs_info
        WHERE concat('/', split(path, '/')[1], '/', split(path, '/')[2],
                     '/', split(path, '/')[3], '/', split(path, '/')[4]) = '/user/hive/warehouse/default.db'
    ) t
    -- concat() returns NULL when the path has fewer than five levels;
    -- test for NULL directly instead of comparing to the string 'null'.
    WHERE joinedpath IS NOT NULL
    GROUP BY joinedpath
) h
ORDER BY sumsize DESC;