#!/bin/bash
#
# Fetch the most recent FsImage from the NameNode and convert it into a
# comma-delimited text file (tmp_image/fsimage.csv) for small-file analysis.
#
# Usage: run from the directory that should contain the tmp_image/ working
#        directory. Requires the `hdfs` CLI on PATH.
#
# Steps:
#   1. Create ./tmp_image; abort if it already exists so that an earlier
#      dump is never mixed with a new one.
#   2. Download the FsImage with `hdfs dfsadmin -fetchImage`.
#   3. Convert it with the Offline Image Viewer (`hdfs oiv -p Delimited`).
#   4. Drop the header row so the file contains data rows only.
#
# Exit status: 0 on success, non-zero if the working directory already
# exists, no fsimage_0* file was fetched, or any hdfs/sed step fails.
set -euo pipefail

# Print an error message to stderr and abort with a failure status.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

current_path=$(pwd)
readonly current_path
readonly image_dir="${current_path}/tmp_image"
readonly csv_file="${image_dir}/fsimage.csv"

echo "当前目录:${current_path}"

if [[ -d "${image_dir}" ]]; then
  # Refuse to reuse an existing directory; the user must remove it manually.
  die "tmp_image目录已经存在,请手动删除"
fi
mkdir -p -- "${image_dir}"
echo "创建tmp_image目录"

echo "正在采集Fsimage文件"
hdfs dfsadmin -fetchImage "${image_dir}" || die "采集Fsimage文件失败"

# Collect matches with a glob instead of parsing `ls`; nullglob makes the
# array empty (rather than holding the literal pattern) when nothing matches.
shopt -s nullglob
fsimage_files=("${image_dir}"/fsimage_0*)
shopt -u nullglob

if (( ${#fsimage_files[@]} == 0 )); then
  die "不存在以fsimage_0开头的文件"
fi
echo "存在以fsimage_0开头的文件"
echo "符合条件的文件: ${fsimage_files[*]}"

# If several images are present, use the last one in glob (lexicographic)
# order so that exactly one path is handed to `hdfs oiv -i`.
filename=${fsimage_files[${#fsimage_files[@]}-1]}
echo "文件名: ${filename}"

# The Delimited processor separates fields with a tab unless -delimiter is
# given; use a comma so the output matches the .csv name and a
# comma-delimited table definition.
# NOTE(review): HDFS paths that themselves contain a comma would be split
# into extra columns - confirm this is acceptable for the target cluster.
hdfs oiv -i "${filename}" -o "${csv_file}" -p Delimited -delimiter "," \
  || die "解析Fsimage文件失败"
echo "成功解析文件并导出为fsimage.csv,路径是: ${csv_file}"

# Remove the first line (the column header emitted by the Delimited processor).
sed -i -e "1d" -- "${csv_file}"
## 二、统计小文件情况
### 1.FsImage文件字段含义
| 字段 | 注释 |
| --- | --- |
| Path | HDFS路径 |
| Replication | 副本数 |
| ModificationTime | 最近修改时间 |
| AccessTime | 最近访问时间 |
| PreferredBlockSize | block size |
| BlocksCount | 块总数 |
| FileSize | 文件大小 |
| NSQUOTA | 名称配额 |
| DSQUOTA | 空间配额 |
| Permission | 文件权限 |
| UserName | 所属用户 |
| GroupName | 所属用户组 |
### 2.本地文件导入hive表
#### 2.1 创建hive外表
-- Hive table over the delimited FsImage dump (one row per inode).
-- Declared EXTERNAL so that dropping the table leaves the uploaded file at
-- LOCATION in place. Rows are parsed as comma-delimited text.
-- Columns: path (HDFS path), replication (replica count), modificationtime,
-- accesstime, preferredblocksize (block size), blockscount (number of
-- blocks), filesize (file size), nsquota (namespace quota), dsquota
-- (disk-space quota), permission, username (owner), groupname (owning group).
CREATE EXTERNAL TABLE tmp.fsimage_info_csv
(
  path               string,
  replication        int,
  modificationtime   string,
  accesstime         string,
  preferredblocksize bigint,
  blockscount        int,
  filesize           bigint,
  nsquota            string,
  dsquota            string,
  permission         string,
  username           string,
  groupname          string
)
ROW FORMAT SERDE
  'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
  'field.delim'=',',
  'serialization.format'=',')
STORED AS INPUTFORMAT
  'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
  'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
  'hdfs://bigdata/user/hive/warehouse/fsimage_info_csv';
将步骤1中解析到的fsimage.csv文件上传到hdfs里:
# Copy the locally generated CSV into the HDFS directory backing the table.
fsimage_csv=./tmp_image/fsimage.csv
table_location=hdfs://bigdata/user/hive/warehouse/fsimage_info_csv/
hdfs dfs -put "${fsimage_csv}" "${table_location}"
#### 2.2 分析统计
-- Count small files (smaller than 4194304 bytes = 4 MiB) per directory,
-- where the directory is the path truncated to its first six components.
-- NOTE(review): the source text was cut off after the inner subquery
-- (") t1"). The filter on 'small' and the GROUP BY below are a minimal
-- reconstruction implied by the outer SELECT list - confirm against the
-- original query.
SELECT
  dir_path,
  COUNT(*) AS small_file_num
FROM
  ( SELECT
      relative_size,
      dir_path
    FROM
      ( SELECT
          CASE
            WHEN filesize < 4194304 THEN 'small'
            ELSE 'large'
          END AS relative_size,
          -- split(path,'/')[0] is the empty string before the leading '/',
          -- so the components start at index 1. Every component is joined
          -- with '/', including the one between [5] and [6].
          concat('/', split(path,'/')[1],
                 '/', split(path,'/')[2],
                 '/', split(path,'/')[3],
                 '/', split(path,'/')[4],
                 '/', split(path,'/')[5],
                 '/', split(path,'/')[6]) AS dir_path
        FROM
          tmp.fsimage_info_csv
        -- Optional filter, disabled by default:
        -- WHERE
        --   replication = 0 and path like '/hive/warehouse/%'
      ) t1
    WHERE
      relative_size = 'small'
  ) t2
GROUP BY
  dir_path;
网上学习资料一大堆,但如果学到的知识不成体系,遇到问题时只是浅尝辄止,不再深入研究,那么很难做到真正的技术提升。
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟,还是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!
<https://bbs.csdn.net/topics/618545628>
一个人可以走得很快,但一群人才能走得更远!不论你是正从事IT行业的老鸟,还是对IT行业感兴趣的新人,都欢迎加入我们的圈子(技术交流、学习资源、职场吐槽、大厂内推、面试辅导),让我们一起学习成长!