我的 shell 脚本代码(由外部 crontab 定时器按小时调度执行):
#!/bin/bash
#
# ===========================================================================
# 程序名称: 点击流数据分析 /周期为小时
# 功能描述:
# 输入参数: 运行日期
# 创建人 : 何睿
# 创建日期: 2017-12-21
# 版本说明: v1.0
# 代码审核:
# 修改人名:
# 修改日期:
# 修改原因:
# 修改列表:
# ===========================================================================
# Absolute path to the hive CLI used by all HQL steps below.
exe_hive="/root/training/apache-hive-1.2.1-bin/bin/hive"
#--------------------------------1.获取时间----------------------------------
# Default processing time: the previous hour, formatted "YYYY-MM-DD HH-00-00"
# (hour-level partition key). NOTE: the value contains a space, so every use
# of $day further down must stay double-quoted.
dayexec=$(date -d '-1 hour' +'%Y-%m-%d %H-00-00')
# An explicit first argument (e.g. for manual re-runs / backfills) overrides
# the computed hour; the normal crontab invocation passes no argument.
if [ -z "$1" ]
then
day="$dayexec"
else
day="$1"
fi
echo "...................shell 脚本执行中 处理的时间是 $day"
#------------------------2.执行MR 清洗日志到loghive目录-----------------------
echo "...................shell 脚本执行中 执行MapReduce 输出路径为/loghive/"
# Main class of the cleaning MR job (informational; the jar manifest is what
# actually selects the driver).
progress_class="cn.log.mr.VisitLogDriver"
# HDFS directory holding this hour's raw logs.
log_inputPath="/visit/time=$day"
# HDFS directory that receives the cleaned output (becomes a Hive partition).
log_outputPath="/loghive/time=$day"
# Count the lines printed by listing the input directory. NOTE(review): a
# non-empty listing also includes a "Found N items" header line, so any
# positive count is treated as "logs exist" — confirm this matches the
# cluster's `hadoop fs -ls` output format.
files=$(hadoop fs -ls "$log_inputPath" | wc -l)
# Only run the MR job when the input directory has something to process.
if [ "$files" -gt 0 ];then
echo "running.. hadoop jar visitlog.jar $log_inputPath $log_outputPath"
# Abort with a non-zero status if the MR job fails, so the Hive steps below
# never run against a missing/partial output directory.
if ! hadoop jar /work/visitlog.jar "$log_inputPath" "$log_outputPath"; then
echo "MapReduce 执行失败 具体原因请详细排查"
exit 1
fi
else
echo "MapReduce 输入路径 还没有原始日志文件 具体原因请详细排查"
# Bare `exit` would return the status of the preceding echo (0) and hide the
# failure from crontab monitoring; exit 1 makes the failure observable.
exit 1
fi
#------------------------3.hive处理,增加分区(进入visitlog表)-----------------------
# Register this hour's MR output directory as a new partition of the external
# visitlog table (no data movement; the partition just points at /loghive/...).
addpartitionHQL="
use visit;
alter table visitlog add partition(time='$day') location '/loghive/time=$day';
"
"$exe_hive" -e "$addpartitionHQL"
if [ $? -eq 0 ]; then
echo "增加分区(进入visitlog表) success"
else
echo "增加分区(进入visitlog表) fail"
# Propagate the failure to the caller; bare `exit` would return 0 (the echo's
# status) and make crontab believe the run succeeded.
exit 1
fi
#-----------------------4.hive处理,数据清洗(进入clearlog表)-----------------------
# Copy this hour's partition from visitlog into the clearlog table, splitting
# the '_'-separated sessionInfo field into three separate columns.
clearHQL="
use visit;
insert into table clearlog partition(time='$day') select url,urlname,title,encoding,size,color,language,forjvm,forcookie,flash,refurl,browser,uv,split(sessionInfo,'_')[0],split(sessionInfo,'_')[1],split(sessionInfo,'_')[2],ip from visitlog where time='$day';
"
"$exe_hive" -e "$clearHQL"
if [ $? -eq 0 ]; then
# Fixed typo in the success message: the table is clearlog, not clearloglog.
echo "数据清洗(进入clearlog表) success"
else
echo "数据清洗(进入clearlog表) fail"
# Exit non-zero so the failure is visible to crontab instead of being masked
# by the echo's status.
exit 1
fi
#------------------------5.hive处理, 最终数据(进入visitAnalysis表)-----------------------
# Compute the hour's aggregate metrics from clearlog and overwrite the
# loganalysis partition: pv, distinct visitors (uv1), sessions (vv), new IPs,
# bounce rate (br1), average session minutes, new customers, average depth.
# NOTE(review): the t4/t7/t8 subqueries scan partitions other than '$day'
# (full-table history) — presumably intentional for "new" metrics; confirm
# this is acceptable as the table grows.
analysisHQL="
use visit;
insert overwrite table loganalysis
partition(time='$day')
select t1.pv,t2.uv1,t3.vv,t4.newip,t5.br1,t6.avgtimes,t7.newcust,t8.avgdept
from
(select count(*) pv from clearlog where time='$day') t1,
(select count(distinct uv) uv1 from clearlog where time='$day') t2,
(select count(distinct session) vv from clearlog where time='$day') t3,
(select count(distinct ip) newip from clearlog where ip not in (select distinct ip oldip from clearlog cl where time <> '$day'))t4,
(select round(ta.sonnum/tb.momnum,2) br1 from (select count(*) sonnum from (select ip,session,count(*) from clearlog where time='$day' group by ip,session having count(*)=1) c)ta,(select count(distinct session) momnum from clearlog where time='$day')tb)t5,
(select round(avg(alltime)/60,2) avgtimes from (select (max(times)-min(times))/1000 alltime from clearlog where time='$day' group by session ) t)t6,
(select count(distinct uv) newcust from clearlog where time='$day' and uv not in (select uv u from clearlog cl1 where time <> '$day'))t7,
(select round(sum(deep)/count(*),2) avgdept from (select max(step) deep from clearlog group by session) d)t8;
"
"$exe_hive" -e "$analysisHQL"
if [ $? -eq 0 ]; then
echo "最终数据(进入visitAnalysis表) success"
else
echo "最终数据(进入visitAnalysis表) fail"
# Exit non-zero on failure so crontab/monitoring can detect it; a bare `exit`
# would return the echo's status (0).
exit 1
fi
echo "$day 日志处理完毕................................"
三张 Hive 表需提前创建好:一张 external 外部表(visitlog),两张 ORC 内部表(clearlog、loganalysis)。