#!/usr/bin/env bash
source /etc/profile
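# Housekeeping for Spark streaming bloom filters. Steps, as implemented below:
#   1. build the list of month-end dates for the current and previous 12 months,
#   2. upload the newest local filter directory to HDFS if it is not there yet,
#   3. prune local and HDFS filters past the 7-day / 30-day cutoffs,
#   4. prune old tdid files.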
echo "*************** start filter ***************"
# get the last day of each of the previous six months (old approach, kept commented out)
#m0=$(date -d "$(date -d 'month' +%Y%m01) -1 day" +%Y%m%d)
#echo ${m0}
#m1=$(date -d "$(date -d '0 month' +%Y%m01) -1 day" +%Y%m%d)
#echo ${m1}
#m2=$(date -d "$(date -d last-month +%Y%m01) -1 day" +%Y%m%d)
#echo ${m2}
#m3=$(date -d "$(date -d ${m2} +%Y%m01) -1 day" +%Y%m%d)
#echo ${m3}
#m4=$(date -d "$(date -d ${m3} +%Y%m01) -1 day" +%Y%m%d)
#echo ${m4}
#m5=$(date -d "$(date -d ${m4} +%Y%m01) -1 day" +%Y%m%d)
#echo ${m5}
#m6=$(date -d "$(date -d ${m5} +%Y%m01) -1 day" +%Y%m%d)
#echo ${m6}
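# How the month-end trick below works (example dates assume a June 2024 run):
#   date -d 'month' +%Y%m01          -> first day of NEXT month, e.g. 20240701
#   date -d '20240701 -1 day' +%Y%m%d -> last day of the CURRENT month, 20240630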
# Get the last day of the current month; the array length is available as ${#m[*]} or ${#m[@]}
m[0]=$(date -d "$(date -d 'month' +%Y%m01) -1 day" +%Y%m%d)
echo "m0 : ${m[0]} month count : ${#m[@]}"
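# Walk backwards a year: the first day of m[n]'s month minus one day is the
# last day of the month before it, so m[0..12] ends up holding 13 month-ends.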
for n in $(seq 0 11); do
    m[$n+1]=$(date -d "$(date -d "${m[$n]}" +%Y%m01) -1 day" +%Y%m%d)
    echo "m$((n + 1)) : ${m[$n+1]} month count : ${#m[*]}"
done
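# Debugging aid (not part of the flow): dump the whole month-end table with
#   printf '%s\n' "${m[@]}"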
echo "****** time : $(date '+%Y-%m-%d %H:%M:%S') ******"

max_date=0
# Find the newest local filter directory and copy it to HDFS
cd /home/hadoop/streaming_run_center/tmp/checkpoint/filter || exit 1
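# Directory names are assumed to look like <name>_<name>_<YYYYMMDD>[.suffix],
# so the date is underscore-field 3 with any extension stripped.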
for dir in $(ls -l ./ | awk '/^d/{print $NF}')
do
    if [[ -d $dir && $dir == *_* ]]; then
        f_d=$(echo "$dir" | cut -d _ -f 3 | cut -d . -f 1)
        if [[ $max_date < $f_d ]]; then
            max_date=$f_d
            max_filter=$dir
        fi
    fi
done
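# A sturdier way to write the scan above (sketch only, not wired in): glob
# for directories instead of parsing `ls -l`, which breaks on odd names:
#   for dir in */; do
#       dir=${dir%/}
#       [[ $dir == *_* ]] || continue
#       ...
#   done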
echo "max date is : $max_date"
echo "max filter is : $max_filter"
pwd
# Copy the most recent filter directory to HDFS
hadoop fs -test -e /data/datacenter/run_center_spark_stream/bloom_filters/$max_filter
if [[ $? == 0 ]]; then
    echo "filter already exists : $max_filter"
else
    echo "start hdfs copy"
    echo "****** start time : $(date '+%Y-%m-%d %H:%M:%S') ******"
    hadoop fs -put "$max_filter" /data/datacenter/run_center_spark_stream/bloom_filters
    echo "****** end time : $(date '+%Y-%m-%d %H:%M:%S') ******"
fi
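# Two retention cutoffs, both measured from the newest filter date rather
# than from today: 7 days for serialized files, 30 days for everything else.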
remove_week=$(date -d "$max_date 7 days ago" +%Y%m%d)
echo "cutoff date for removing local serialized files : $remove_week"
remove_date=$(date -d "$max_date 30 days ago" +%Y%m%d)
echo "cutoff date for removing local files and HDFS filters : $remove_date"
echo "*************** start remove filter ***************"
for r_dir in $(ls -l ./ | awk '/^d/{print $NF}')
do
    if [[ -d $r_dir && $r_dir == *_* ]]; then
        r_d=$(echo "$r_dir" | cut -d _ -f 3 | cut -d . -f 1)
        if [[ $r_d < $remove_date ]]; then
            if [[ ${m[*]} == *$r_d* ]]; then
                cd "/home/hadoop/streaming_run_center/tmp/checkpoint/filter/$r_dir" || continue
                pwd
                for f_dir in $(ls)
                do
                    if [[ "$f_dir" == "mau_device_all.FILTER.SER" ]]; then
                        echo "------ keep mau_filter is: $f_dir"
                    else
                        echo "remove file is: $f_dir"
                        rm -r "$f_dir"
                    fi
                done
                cd /home/hadoop/streaming_run_center/tmp/checkpoint/filter
                pwd
            else
                echo "remove filter_dir is: $r_dir"
                rm -r "$r_dir"
            fi
        elif [[ $r_d < $remove_week ]]; then
            # m[0..2] are the three newest month-end dates
            if [[ $r_d == ${m[0]} || $r_d == ${m[1]} || $r_d == ${m[2]} ]]; then
                cd "/home/hadoop/streaming_run_center/tmp/checkpoint/filter/$r_dir" || continue
                pwd
                for f_dir in $(ls)
                do
                    if [[ "$f_dir" == "mau_device_all.FILTER.SER" ]]; then
                        echo "------ week keep mau_filter is: $f_dir"
                    else
                        if [[ "$f_dir" == *.FILTER.SER ]]; then
                            echo "- last day of month - week remove file is: $f_dir"
                            rm -r "$f_dir"
                        fi
                    fi
                done
                cd /home/hadoop/streaming_run_center/tmp/checkpoint/filter
                pwd
            else
                echo "week remove filter is: $r_dir"
                rm -r "$r_dir"/*.FILTER.SER
            fi
        fi
    fi
done
echo " =============== start remove hdfs filter =============== "
# Remove filters on HDFS that fall outside the retained dates
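# HDFS pruning mirrors the local policy: filters dated on a tracked month-end
# are kept; anything older than the 30-day cutoff is deleted.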
for h_filter in $(hadoop fs -ls /data/datacenter/run_center_spark_stream/bloom_filters | awk '{print $8}')
do
    if [[ $h_filter == *_* ]]; then
        h_date=$(echo "$h_filter" | cut -d / -f 6 | cut -d _ -f 3 | cut -d . -f 1)
        # echo " hdfs date : "$h_date
        # echo " hdfs filter : "$h_filter
        if [[ ${m[*]} == *$h_date* ]]; then
            echo "remain hdfs filter is : $h_filter"
        elif [[ $h_date < $remove_date ]]; then
            echo "not remain date is : $h_date"
            echo "remove hdfs filter is : $h_filter"
            hadoop fs -rm -r "$h_filter"
        fi
    fi
done
echo "-------------- start tdid ---------------"
# Remove tdid files dated before the 30-day cutoff
cd /home/hadoop/streaming_run_center/tmp/checkpoint/tdidinfo || exit 1
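# tdid file names are assumed to look like <name>_<YYYYMMDD>[.suffix], so the
# date is underscore-field 2 with any extension stripped.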
for tdid in $(ls)
do
    if [[ $tdid == *_* ]]; then
        t_d=$(echo "$tdid" | cut -d _ -f 2 | cut -d . -f 1)
        if [[ $t_d == $max_date || $t_d > $max_date ]]; then
            echo "need copy date : $t_d"
            echo "need copy tdid : $tdid"
            # Check whether the tdid already exists on HDFS (disabled)
            # hadoop fs -test -e jiaojiao/tdid/$tdid
            # if [[ $? == 0 ]]; then
            #     echo " tdid already exists, remove it first "
            #     hadoop fs -rm jiaojiao/tdid/$tdid
            #     hadoop fs -put $tdid jiaojiao/tdid
            # else
            #     echo " start copy "
            #     hadoop fs -put $tdid jiaojiao/tdid
            # fi
        elif [[ $t_d < $remove_date ]]; then
            echo "remove tdid : $tdid"
            rm "$tdid"
        fi
    fi
done
#echo " =============== start remove hdfs tdid =============== "
#for h_tdid in $(hadoop fs -ls jiaojiao/tdid | awk '{print $8}')
#do
# if [[ $h_tdid == *\_* ]]; then
# h_date=$(echo $h_tdid | cut -d \_ -f 2 | cut -d \. -f 1)
# echo $h_date
# echo $h_tdid
# fi
#done