实现内容:
从 MySQL 的 Hive 元数据库中提取表/分区的元数据,并结合 HDFS fsimage 文件信息,统计每个表目录下的文件数、文件总大小和平均文件大小,用于分析小文件问题。
脚本内容:
#!/bin/bash
# Purpose: combine Hive metastore data (MySQL) with HDFS fsimage data to
# report file count / total size / average size per table directory, for
# small-file analysis.
# Trace every command; abort on the first error, unset variable, or
# failed pipeline stage.
set -x
set -euo pipefail

current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "start time: $current_time"

cd /data12/wqg/fsimage/
# Remove fsimage dumps left over from a previous run (plain files, so
# -f is sufficient; -f also keeps rm quiet when the glob matches nothing).
rm -f /data12/wqg/fsimage/fsimage_*
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
  echo "fsimage.csv文件不存在"
else
  echo "fsimage.csv文件存在需要删除"
  rm -f /data12/wqg/fsimage/fsimage.csv
fi
# Obtain a Kerberos ticket as the hdfs superuser (required for fetchImage).
kinit -kt /data12/wqg/keytab/hdfs.keytab hdfs@MYCDH

# Download the current fsimage from the active NameNode.
hdfs dfsadmin -fetchImage /data12/wqg/fsimage/

# Locate the fetched fsimage file; if several match, the last one wins.
# Glob instead of `ls` so paths with unusual characters are safe.
fsimage_name=
for filename in /data12/wqg/fsimage/fsimage_*; do
  fsimage_name=$filename
  echo "$current_time>> fsimage名称:$filename"
done

# Allow the offline image viewer up to 30G of heap.
export HADOOP_OPTS="-Xmx30G"

# Convert the binary fsimage into a delimited text file.
# BUG FIX: $fsimage_name already holds an absolute path (the glob above,
# like the original `ls /data12/...`, yields full paths); the original
# script prefixed the directory again, producing a non-existent path.
hdfs oiv -i "$fsimage_name" -o /data12/wqg/fsimage/fsimage.csv -p Delimited
cd /data12/wqg/fsimage/
if [ ! -f "/data12/wqg/fsimage/fsimage.csv" ]; then
  echo "fsimage.csv文件不存在 无法执行上传"
else
  # Drop the header row emitted by `hdfs oiv -p Delimited`.
  sed -i -e "1d" /data12/wqg/fsimage/fsimage.csv
  # Upload to the HDFS location backing the external Hive table below.
  hadoop fs -put -f /data12/wqg/fsimage/fsimage.csv /tmp/rock/fsimg/
fi

# Switch to the hive principal for the metastore/Hive work below.
kinit -kt /data12/wqg/keytab/hive.keytab hive@MYCDH

# BUG FIX: refresh the timestamp — the original reused the script-start
# time here, so every later log line reported a stale time. Also fixes
# the "fetimage" typo in the log message.
current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "$current_time: end fetchImage-----------"
echo "$current_time: mysql 元数据创建表操作"
# Build one flat table mapping every Hive table/partition to its HDFS
# directory, straight from the metastore tables (TBLS/SDS/DBS/PARTITIONS).
# NOTE(review): the connection parameters are placeholders — fill in the
# real host/port/user/password; prefer --defaults-extra-file over putting
# the password on the command line (visible in `ps` and the set -x trace).
mysql -h(ip地址) -P(端口号) -u用户名 -p(密码) 库名 << EOF
use test;
-- BUG FIX: IF EXISTS keeps the very first run (table absent) from
-- failing, which would abort the whole script under set -e.
drop table if exists test.hive_tab_dir_detail;
create table test.hive_tab_dir_detail as
-- flag=0: unpartitioned tables — location is the table directory itself.
select 0 flag,c.NAME,c.DB_LOCATION_URI,a.TBL_NAME,a.CREATE_TIME,a.OWNER,a.TBL_TYPE,b.LOCATION
from hive.TBLS a,hive.SDS b,hive.DBS c
where a.SD_ID=b.SD_ID and a.DB_ID=c.DB_ID
and a.tbl_id not in (select d.tbl_id from hive.partitions d where a.tbl_id=d.tbl_id )
union all
-- flag=1: partitioned tables — one row per partition directory.
select 1 flag,c.NAME,c.DB_LOCATION_URI,a.TBL_NAME,a.CREATE_TIME,a.OWNER,a.TBL_TYPE,concat(b.LOCATION,'/',d.part_name) LOCATION
from hive.TBLS a,hive.SDS b,hive.DBS c,hive.partitions d
where a.SD_ID=b.SD_ID and a.DB_ID=c.DB_ID and a.tbl_id = d.tbl_id
and a.tbl_id in (select d.tbl_id from hive.partitions d where a.tbl_id=d.tbl_id );
EOF
# Ship the metadata table from MySQL into HDFS for Hive to read.
# NOTE(review): credentials below are hard-coded placeholders; a
# --password on the command line leaks via `ps` and the set -x trace —
# consider --password-file in production.
echo "$current_time: start sqoop"
sqoop_args=(
  --connect "jdbc:mysql://ip:port/test"
  --username root
  --password admin123
  --table hive_tab_dir_detail
  --fields-terminated-by ','
  --delete-target-dir
  --target-dir /tmp/wqg/hive_meta_tab_detail
  --m 1
)
sqoop import "${sqoop_args[@]}"
echo "$current_time: start hive -e"
# The cluster is kerberized, so the SQL goes through beeline; without
# Kerberos a plain `hive -e` would also work.
# BUG FIX: comments inside the HiveQL string must use `--`; the original
# used `#`, which HiveQL does not accept, so beeline failed to parse.
# BUG FIX: every DROP TABLE now has IF EXISTS, and the final *_sort table
# is dropped before CREATE, so the script survives both the first run
# (tables absent) and repeated runs (table already exists).
beeline -u "jdbc:hive2://ip:port/default;principal=hive/主体" -e "
use rhao;
-- External table over the MySQL metadata exported by sqoop.
CREATE EXTERNAL TABLE if not exists rhao.wqg_hive_meta_tab_detail(
flag string,
name string,
db_location_uri string,
tbl_name string,
create_time bigint,
owner string,
tbl_type string,
location string
)
row format delimited fields terminated by ','
LOCATION '/tmp/wqg/hive_meta_tab_detail';
-- External table over the fsimage CSV uploaded earlier.
CREATE EXTERNAL TABLE if not exists rhao.HDFS_META_TEMP(
PATH STRING,
REPL INT,
MODIFICATION_TIME STRING,
ACCESSTIME STRING,
PREFERREDBLOCKSIZE INT,
BLOCKCOUNT DOUBLE,
FILESIZE DOUBLE,
NSQUOTA INT,
DSQUOTA INT,
PERMISSION STRING,
USERNAME STRING,
GROUPNAME STRING
)
row format delimited fields terminated by '\t'
LOCATION '/tmp/rock/fsimg/';
-- Per-directory stats; rows with repl=0 are directories and excluded.
drop table if exists rhao.hdfs_dir_stat_2;
create table rhao.hdfs_dir_stat_2 as
select regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1) dir,
modification_time,
accesstime,
count(1) file_nums,
sum(blockcount) blockcounts,
sum(filesize) filesizes,
sum(filesize)/count(1) avg_filesize
from rhao.hdfs_meta_temp where repl <> 0
group by regexp_extract(regexp_extract(path,'(\/([^\/]+\/)+)',1),'((\/[^\/]+)+)',1),modification_time,accesstime;
-- Join directory stats to table/partition locations.
drop table if exists rhao.hive_last_dir_stat_all_2;
create table rhao.hive_last_dir_stat_all_2 as
select
a.name db_name,
a.tbl_name table_name,
a.create_time,
a.tbl_type tbl_type ,
regexp_replace(a.location,'hdfs://nameservice1','') location,
b.modification_time,
b.accesstime,
b.file_nums,
b.blockcounts,
b.filesizes,
b.avg_filesize
from rhao.wqg_hive_meta_tab_detail a , rhao.hdfs_dir_stat_2 b
where regexp_replace(a.location,'hdfs://nameservice1','') = b.dir;
-- Aggregate to table level, largest file counts first.
drop table if exists rhao.hive_tab_dir_stat_sort_all_2;
create table rhao.hive_tab_dir_stat_sort_all_2 as
select db_name,
table_name,
create_time,
modification_time,
accesstime,
tbl_type,
sum(file_nums) file_nums,
sum(blockcounts) blockcounts,
sum(filesizes) filesizes,
sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2
group by db_name,table_name,tbl_type,create_time,modification_time,accesstime
sort by file_nums desc;
-- Final report with human-readable timestamps.
drop table if exists rhao.hive_tab_dir_stat_sort_all_2_sort;
create table rhao.hive_tab_dir_stat_sort_all_2_sort as
select db_name,
table_name,
from_unixtime(create_time,'yyyy-MM-dd HH:mm:ss') create_time,
tbl_type,
max(from_unixtime(unix_timestamp(concat(a.modification_time,':00')),'yyyy-MM-dd HH:mm:ss')) modification_time,
max(from_unixtime(unix_timestamp(concat(a.accesstime,':00')),'yyyy-MM-dd HH:mm:ss')) accesstime,
sum(file_nums) file_nums,
sum(blockcounts) blockcounts,
sum(filesizes) filesizes,
sum(filesizes)/sum(file_nums) avg_filesizes
from rhao.hive_last_dir_stat_all_2 a
group by db_name,table_name,tbl_type,create_time
sort by file_nums desc;
"
# BUG FIX: recompute the end time — the original reused the start time.
current_time=$(date '+%Y-%m-%d %H:%M:%S')
echo "end time: $current_time"