1. Delete historical data daily: Hive table partitions and the underlying HDFS data
Delete the data from 180 days ago (that day's partition):
#!/bin/bash
# $1 is an optional base date; the script works relative to the previous day
curr_date=`date -d "-1 day ${1}" +%Y-%m-%d`
echo "current date: ${curr_date}"
target_date=`date -d "${curr_date} 180 days ago" +%Y-%m-%d`
echo "target date: ${target_date}"
# list of tables whose old partitions should be deleted
myarr=("table_name")
# iterate over the table list
for i in "${myarr[@]}"
do
    echo "hdfs_dir/database_name/$i/dt=${target_date}"
    hdfs dfs -test -e "hdfs_dir/database_name/$i/dt=${target_date}"
    if [ $? -eq 0 ]; then
        echo 'exist'
        # drop the Hive partition, then remove the HDFS directory behind it
        hive -e "alter table $i drop partition (dt='${target_date}')"
        hdfs dfs -rm -r "hdfs_dir/database_name/$i/dt=${target_date}"
    else
        echo 'MyWarning: directory does not exist'
    fi
done
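A minimal sketch of how this could be invoked and scheduled, assuming the script above is saved as clean_hive_history.sh (a hypothetical name; the log path below is also hypothetical):

# run manually for a specific base date (the partition 180 days before the previous day is dropped)
bash clean_hive_history.sh 2019-07-01

# or schedule it daily at 01:30 via crontab; with no argument it uses today as the base date
30 1 * * * /bin/bash /path/to/clean_hive_history.sh >> /path/to/clean_hive_history.log 2>&1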
Here, hdfs dfs -test returns an exit status according to the state of the path:
-d: returns 0 if the given path is a directory.
-e: returns 0 if the given path exists.
-f: returns 0 if the given path is a file.
-s: returns 0 if the given path is not empty.
-z: returns 0 if the file is zero length.
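For example, a quick sketch of checking a path before acting on it (the path below is a hypothetical partition directory):

# exit status 0 means the path exists; non-zero means it does not
if hdfs dfs -test -e /user/hive/warehouse/database_name.db/table_name/dt=2019-01-01; then
    echo "partition directory exists"
else
    echo "partition directory missing"
fi

# -s works the same way and additionally checks that the path is not empty
hdfs dfs -test -s /user/hive/warehouse/database_name.db/table_name/dt=2019-01-01 && echo "not empty"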
2. Delete HDFS historical data by timestamp
# First, dump the paths that are candidates for deletion into temp.txt
hdfs dfs -ls /tmp | grep com.jd.ad.datamill.algorithms > temp.txt
target_date="2019-06-30"
target_timestamp=$(date -d "$target_date" +%s)
# hdfs dfs -ls output format: permissions, replication, owner, group, size, date (2019-06-25), time (11:12), path
cat temp.txt | while read -r quanxian temp user group size day hour filepath
do
    # Variant (commented out): delete files older than 45 days (45*24*60*60 seconds)
    #today_timestamp=$(date +%s)
    #current_file_time="$day $hour"
    #current_file_timestamp=$(date -d "$current_file_time" +%s)
    #if [ $(($today_timestamp-$current_file_timestamp)) -ge $((45*24*60*60)) ]; then
    #    echo "$(date +'%Y-%m-%d %H:%M:%S') $filepath"
    #    $HADOOP_BIN_PATH/hadoop fs -rm -r $filepath > /dev/null 2>&1
    #fi
    # This part deletes everything dated before target_timestamp
    file_timestamp=$(date -d "$day" +%s)
    if [ "${target_timestamp}" -gt "${file_timestamp}" ]; then
        echo "${day}"
        echo "${filepath}"
        hdfs dfs -rm -r "${filepath}"
    fi
done
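The same filter can also be written more compactly with awk, comparing the date column as a string (valid because the dates are in YYYY-MM-DD form). This is an alternative sketch using the same /tmp path and grep pattern as above:

# print and delete every matching path whose date column ($6) is before the cutoff; $8 is the path
hdfs dfs -ls /tmp | grep com.jd.ad.datamill.algorithms | awk -v cutoff="2019-06-30" '$6 < cutoff {print $8}' | while read -r filepath
do
    echo "${filepath}"
    hdfs dfs -rm -r "${filepath}"
done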
"""
hdfs dfs -ls /tmp |grep com.jd.ad.datamill.algorithms >temp.txt
"""
"""
hdfs dfs -rm -r hdfs://ns1018/user/jd_ad/ads_dm/.Trash/Current/tmp/com.jd.ad.datamill.algorithms*
"""
3. Query HDFS file information: blocks, stripes, etc.
Overall file and block status: hdfs fsck hdfs_path
ORC stripe information, schema, etc.: hive --orcfiledump hdfs_file
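A short usage sketch of both commands; the partition file path below is hypothetical:

# block-level report: files, block list, and the datanodes holding each block
hdfs fsck /user/hive/warehouse/database_name.db/table_name/dt=2019-01-01/000000_0 -files -blocks -locations

# ORC metadata: file schema, stripe boundaries, row counts, and column statistics
hive --orcfiledump /user/hive/warehouse/database_name.db/table_name/dt=2019-01-01/000000_0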