实现内容:
从 MySQL 的 Hive 元数据库中提取表/分区的元数据,并结合 HDFS fsimage 文件信息,统计每个表目录下的文件数、文件总大小和平均文件大小,用于分析小文件问题。
脚本内容:
#!/bin/bash
# Purpose: combine Hive metastore data (MySQL) with HDFS fsimage data to
# report file count / total size / average size per table directory, for
# small-file analysis.
# Trace every command; abort on the first error, unset variable, or
# failed pipeline stage.
set -x
set -euo pipefail

current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "start time: $current_time"

cd /data12/wqg/fsimage/
# Remove fsimage dumps left over from a previous run (plain files, so
# -f is sufficient; -f also keeps rm quiet when the glob matches nothing).
rm -f /data12/wqg/fsimage/fsimage_*
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
  echo "fsimage.csv文件不存在"
else
  echo "fsimage.csv文件存在需要删除"
  rm -f /data12/wqg/fsimage/fsimage.csv
fi
# Obtain a Kerberos ticket as the hdfs superuser (required for fetchImage).
kinit -kt /data12/wqg/keytab/hdfs.keytab hdfs@MYCDH

# Download the current fsimage from the active NameNode.
hdfs dfsadmin -fetchImage /data12/wqg/fsimage/

# Locate the fetched fsimage file; if several match, the last one wins.
# Glob instead of `ls` so paths with unusual characters are safe.
fsimage_name=
for filename in /data12/wqg/fsimage/fsimage_*; do
  fsimage_name=$filename
  echo "$current_time>> fsimage名称:$filename"
done

# Allow the offline image viewer up to 30G of heap.
export HADOOP_OPTS="-Xmx30G"

# Convert the binary fsimage into a delimited text file.
# BUG FIX: $fsimage_name already holds an absolute path (the glob above,
# like the original `ls /data12/...`, yields full paths); the original
# script prefixed the directory again, producing a non-existent path.
hdfs oiv -i "$fsimage_name" -o /data12/wqg/fsimage/fsimage.csv -p Delimited
cd /data12/wqg/fsimage/
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
  echo "fsimage.csv文件不存在 无法执行上传"
else
  # Drop the header row emitted by `hdfs oiv -p Delimited`.
  sed -i -e "1d" /data12/wqg/fsimage/fsimage.csv
  # Upload to the HDFS location backing the external Hive table below.
  hadoop fs -put -f /data12/wqg/fsimage/fsimage.csv /tmp/rock/fsimg/
fi

# Switch to the hive principal for the metastore/Hive work below.
kinit -kt /data12/wqg/keytab/hive.keytab hive@MYCDH

# BUG FIX: refresh the timestamp — the original reused the script-start
# time here, so every later log line reported a stale time. Also fixes
# the "fetimage" typo in the log message.
current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "$current_time: end fetchImage-----------"
echo "$current_time: mysql 元数据创建表操作"
# Build one flat table mapping every Hive table/partition to its HDFS
# directory, straight from the metastore tables (TBLS/SDS/DBS/PARTITIONS).
# NOTE(review): the connection parameters are placeholders — fill in the
# real host/port/user/password; prefer --defaults-extra-file over putting
# the password on the command line (visible in `ps` and the set -x trace).
mysql -h(ip地址) -P(端口号) -u用户名 -p(密码) 库名 << EOF
use test;
-- BUG FIX: IF EXISTS keeps the very first run (table absent) from
-- failing, which would abort the whole script under set -e.
drop table if exists test.hive_tab_dir_detail;
create table test.hive_tab_dir_detail as
-- flag=0: unpartitioned tables — location is the table directory itself.
select 0 flag,c.NAME,c.DB_LOCATION_URI,a.TBL_NAME,a.CREATE_TIME,a.OWNER,a.TBL_TYPE,b.LOCATION
from hive.TBLS a,hive.SDS b,hive.DBS c
where a.SD_ID=b.SD_ID and a.DB_ID=c.DB_ID
and a.tbl_id not in (select d.tbl_id from hive.partitions d where a.tbl_id=d.tbl_id )
union all
-- flag=1: partitioned tables — one row per partition directory.
select 1 flag,c.NAME,c.DB_LOCATION_URI,a.TBL_NAME,a.CREATE_TIME,a.OWNER,a.TBL_TYPE,concat(b.LOCATION,'/',d.part_name) LOCATION
from hive.TBLS a,hive.SDS b,hive.DBS c,hive.partitions d
where a.SD_ID=b.SD_ID and a.DB_ID=c.DB_ID and a.tbl_id = d.tbl_id
and a.tbl_id in (select d.tbl_id from hive.partitions d where a.tbl_id=d.tbl_id );
EOF
# Ship the metadata table from MySQL into HDFS for Hive to read.
# NOTE(review): credentials below are hard-coded placeholders; a
# --password on the command line leaks via `ps` and the set -x trace —
# consider --password-file in production.
echo "$current_time: start sqoop"
sqoop_args=(
  --connect "jdbc:mysql://ip:port/test"
  --username root
  --password admin123
  --table hive_tab_dir_detail
  --fields-terminated-by ','
  --delete-target-dir
  --target-dir /tmp/wqg/hive_meta_tab_detail
  --m 1
)
sqoop import "${sqoop_args[@]}"
echo "$current_time: start hive -e"
# The cluster is kerberized, so the SQL goes through beeline; without
# Kerberos a plain `hive -e` would also work.
# BUG FIX: comments inside the HiveQL string must use `--`; the original
# used `#`, which HiveQL does not accept, so beeline failed to parse.
# BUG FIX: every DROP TABLE now has IF EXISTS, and the final *_sort table
# is dropped before CREATE, so the script survives both the first run
# (tables absent) and repeated runs (table already exists).
beeline -u "jdbc:hive2://ip:port/default;principal=hive/主体" -e "
use rhao;
-- External table over the MySQL metadata exported by sqoop.
CREATE EXTERNAL TABLE if not exists rhao.wqg_hive_meta_tab_detail(
flag string,
name string,
db_location_uri string,
tbl_name string,
create_time bigint,
owner string,
tbl_type string,
location string
)
row format delimited fields terminated by ','
LOCATION '/tmp/wqg/hive_meta_tab_detail';
-- External table over the fsimage CSV uploaded earlier.
CREATE EXTERNAL TABLE if not exists rhao.HDFS_META_TEMP(
PATH STRING,
REPL INT,
MODIFICATION_TIME STRING,
ACCESSTIME STRING,
PREFERREDBLOCKSIZE INT,
BLOCKCOUNT DOUBLE,
FILESIZE DOUBLE,
NSQUOTA INT,
DSQUOTA INT,
PERMISSION STRING,
USERNAME STRING,
GROUPNAME STRING
)
row format delimited fields terminated by '\t'
LOCATION '/tmp/rock/fsimg/';
-- Per-directory stats; rows with repl=0 are directories and excluded.
drop table if exists rhao.hdfs_dir_stat_2;
create table rhao.hdfs_dir_stat_2 as
select regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1) dir,
modification_time,
accesstime,
count(1) file_nums,
sum(blockcount) blockcounts,
sum(filesize) filesizes,
sum(filesize)/count(1) avg_filesize
from rhao.hdfs_meta_temp where repl <> 0
group by regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1),modification_time,accesstime;
-- Join directory stats to table/partition locations.
drop table if exists rhao.hive_last_dir_stat_all_2;
create table rhao.hive_last_dir_stat_all_2 as
select
a.name db_name,
a.tbl_name table_name,
a.create_time,
a.tbl_type tbl_type ,
regexp_replace(a.location,'hdfs://nameservice1','') location,
b.modification_time,
b.accesstime,
b.file_nums,
b.blockcounts,
b.filesizes,
b.avg_filesize
from rhao.wqg_hive_meta_tab_detail a , rhao.hdfs_dir_stat_2 b
where regexp_replace(a.location,'hdfs://nameservice1','') = b.dir;
-- Aggregate to table level, largest file counts first.
drop table if exists rhao.hive_tab_dir_stat_sort_all_2;
create table rhao.hive_tab_dir_stat_sort_all_2 as
select db_name,
table_name,
create_time,
modification_time,
accesstime,
tbl_type,
sum(file_nums) file_nums,
sum(blockcounts) blockcounts,
sum(filesizes) filesizes,
sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2
group by db_name,table_name,tbl_type,create_time,modification_time,accesstime
sort by file_nums desc;
-- Final report with human-readable timestamps.
drop table if exists rhao.hive_tab_dir_stat_sort_all_2_sort;
create table rhao.hive_tab_dir_stat_sort_all_2_sort as
select db_name,
table_name,
from_unixtime(create_time,'yyyy-MM-dd HH:mm:ss') create_time,
tbl_type,
max(from_unixtime(unix_timestamp(concat(a.modification_time,':00')),'yyyy-MM-dd HH:mm:ss')) modification_time,
max(from_unixtime(unix_timestamp(concat(a.accesstime,':00')),'yyyy-MM-dd HH:mm:ss')) accesstime,
sum(file_nums) file_nums,
sum(blockcounts) blockcounts,
sum(filesizes) filesizes,
sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2 a
group by db_name,table_name,tbl_type,create_time
sort by file_nums desc;
"
# BUG FIX: recompute the end time — the original reused the start time.
current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "end time: $current_time"