1、脚本目录路径
[spark@Master data_dir]$ pwd
/home/spark/opt/data_dir
[spark@Master data_dir]$ ls -R /home/spark/opt/data_dir
/home/spark/opt/data_dir:
weixin_data.sh yemao_log_20151214.csv yemao_log_20151215.csv yesterday.list
2、Shell批处理脚本内容
[spark@Master data_dir]$ cat weixin_data.sh
#/bin/bash
#echo -n "please enter a day for runing :"
#read yesterday
while read yesterday
do
cd /home/spark/opt/data_dir/
/home/spark/opt/mongodb-2.7.0/bin/mongoexport -d yemao -c yemao_log_$yesterday --csv --query '{"browser":/MicroMessenger/}' -f id,time,url_from,url_current,url_to,token -o /home/spark/opt/data_dir/yemao_log_$yesterday.csv
/usr/local/mysql/bin/mysql -udatahouse -pDTHS2016 -h120.55.189.188 -P3306 --default-character-set=utf8 -e "use logdata; LOAD DATA LOCAL INFILE '/home/spark/opt/data_dir/yemao_log_$yesterday.csv' INTO TABLE yemao_weixin_log FIELDS TERMINATED BY ',' IGNORE 1 LINES;";
done < /home/spark/opt/data_dir/yesterday.list
3、存放日期的参数文件
[spark@Master data_dir]$ cat yesterday.list
20151214
20151215
4、Mysql端对应的表结构
show create table yemao_weixin_log;
CREATE TABLE `yemao_weixin_log` (
`id` int(20) DEFAULT NULL,
`time` varchar(200) DEFAULT NULL,
`url_from` varchar(200) DEFAULT NULL,
`url_current` varchar(200) DEFAULT NULL,
`url_to` varchar(200) DEFAULT NULL,
`token` varchar(200) DEFAULT NULL
) ENGINE=InnoDB DEFAULT CHARSET=utf8
5、根据token的不同计算uv
select a1.log_date 统计日期,count(*) uv,sum(a1.pv) pv from
(select FROM_UNIXTIME(time,'%Y%m%d') log_date,token,count(*) pv from yemao_weixin_log group by FROM_UNIXTIME(time,'%Y%m%d'),token) a1
group by a1.log_date
6、说明
mysql load data数据导入时过滤首行使用IGNORE 1 LINES选项;
存放日期的参数文件每个日期一行,可根据需要写入;
mongodb中Like过滤使用如:--query '{"browser":/MicroMessenger/}'所示选项; 因为方便起见,本文中的程序路径都写入了绝对路径,可确保调用的时候不出错。