将show.log上传至hdfs的目录下
/log/show/20160101/show.log
用两种方式统计点击次数最高的前10个用户
# Approach 1: tally lines per first field (the user id, per the notes) inside
# awk, then rank the "<count> <id>" lines numerically, highest first.
awk '{ hits[$1]++ } END { for (uid in hits) print hits[uid] " " uid }' show.log \
  | sort -nr \
  | head -n 10
# Approach 2: extract the first field and let sort/uniq do the tallying, then
# rank on the count column produced by uniq -c.
awk '{ print $1 }' show.log \
  | sort \
  | uniq -c \
  | sort -k1,1nr \
  | head -n 10
显示为
9 175ed94b25634d279267ee7f4bc78fe91af1a4da
6 cdb553b08426430183802d085754d3512cb0970a
4 a389c51499484f84b71d26f6a2931edb96825a69
4 822e60d2b699496195d24dc5ddbf92fbac598af3
3 fe606002356c4df1923083c7eeb31e74b4b0502e
3 f8069b0db91345758421e3f781279c78b66360f1
3 ecddeb81f46b4f718cc5c6214a7c495ab88b4a6d
3 e0351f9382c14c008047c13713f5f2e3f7341fad
3 d5b589a6e66c45da8ea117c7c2e860cb0dab4812
3 cc6b23ba6e5846f8b49653b97e171a8c6d70d8f7
用hadoop命令将执行的结果输出到hdfs中
# Hadoop Streaming job: the mapper emits the first field of every log line,
# the reducer counts occurrences per key and writes "<key>\t<count>".
# NOTE: Hadoop refuses to start the job if the -output directory already exists.
hadoop jar /home/hadoop/app/hadoop-2.6.0/share/hadoop/tools/lib/hadoop-streaming-2.6.0.jar \
  -input /log/show/20160101 \
  -output /user/wuyuezhen/output20160416 \
  -mapper 'awk "{print $1}"' \
  -reducer "awk '{sum[\$1]++}END{for(key in sum) print key\"\t\"sum[key]}'"
使用浏览器通过python脚本查看内容
安装mysql服务端
yum install -y mysql-server mysql mysql-devel
进入mysql命令行(用hive用户登录,例如 mysql -uhive -phive),在test数据库里,创建两个表
-- Total page views together with the date they belong to.
CREATE TABLE statpv_00 (
  `pv`   INT(10),
  `date` DATETIME
);
-- Per-user page views (the top-N users) together with the date they belong to.
CREATE TABLE stattop_00 (
  `userid` VARCHAR(100),
  `pv`     INT(10),
  `date`   DATETIME
);
在/home/hadoop/app路径下建立load.sh文件,赋予执行权限
# Create the script first; chmod fails on a file that does not exist yet.
vi load.sh
chmod +x load.sh
#!/bin/bash
# Summarise the streaming job output (lines of "<userid>\t<count>") and load
# the result into MySQL database `test`:
#   statpv.<date>  -> statpv_00   (total pv, date)
#   stattop.<date> -> stattop_00  (userid, pv, date) for the 10 highest pv
# Both files are written to the current directory.
set -euo pipefail

MYPATH=$(pwd)
HADOOPCMD="/home/hadoop/app/hadoop-2.6.0/bin/hadoop"
# One definition each for the job output directory and the log day, so the two
# extracts below cannot drift apart (they previously read different users'
# directories and stamped different dates).
OUTPUT_DIR="/user/wuyuezhen/output20160416"
LOGDATE="20160101"

# Total pv: sum of the per-user counts in column 2. The glob is quoted so it
# is expanded by HDFS, not by the local shell.
"$HADOOPCMD" fs -cat "$OUTPUT_DIR/*" \
  | /usr/bin/awk -v day="$LOGDATE" '{sum+=$2}END{printf "%d\t%s\n", sum, day}' \
  > "statpv.$LOGDATE"

# Ten users with the highest pv. awk 'NR <= 10' is used instead of head so
# that sort is never killed by SIGPIPE, which pipefail would report as a failure.
"$HADOOPCMD" fs -cat "$OUTPUT_DIR/*" \
  | /usr/bin/awk -v day="$LOGDATE" '{sum[$1]=$2}END{for(key in sum) print key"\t"sum[key]"\t"day}' \
  | sort -k2,2 -n -r \
  | /usr/bin/awk 'NR <= 10' \
  > "stattop.$LOGDATE"

mysql -uhive -phive -e "use test;
load data local infile '$MYPATH/statpv.$LOGDATE' into table statpv_00;
load data local infile '$MYPATH/stattop.$LOGDATE' into table stattop_00;"
执行脚本,会出现两个文件
./load.sh
在/home/hadoop/app目录下建立文件夹,创建文件,赋权限
mkdir cgi-bin
cd cgi-bin
vi show.py
chmod +x show.py
#!/bin/bash
# NOTE(review): this listing is a bash script with the same content as load.sh,
# although the notes save it as cgi-bin/show.py; presumably the actual CGI
# script was meant to be pasted here — confirm against the original source.
#
# Summarise the streaming job output (lines of "<userid>\t<count>") and load
# the result into MySQL database `test`:
#   statpv.<date>  -> statpv_00   (total pv, date)
#   stattop.<date> -> stattop_00  (userid, pv, date) for the 10 highest pv
# Both files are written to the current directory.
set -euo pipefail

MYPATH=$(pwd)
HADOOPCMD="/home/hadoop/app/hadoop-2.6.0/bin/hadoop"
# One definition each for the job output directory and the log day, so both
# extracts carry the same date (the top-10 rows were previously stamped
# 20151201 while everything else used 20160101).
OUTPUT_DIR="/user/wuyuezhen/output20160416"
LOGDATE="20160101"

# Total pv: sum of the per-user counts in column 2. The glob is quoted so it
# is expanded by HDFS, not by the local shell.
"$HADOOPCMD" fs -cat "$OUTPUT_DIR/*" \
  | /usr/bin/awk -v day="$LOGDATE" '{sum+=$2}END{printf "%d\t%s\n", sum, day}' \
  > "statpv.$LOGDATE"

# Ten users with the highest pv. awk 'NR <= 10' is used instead of head so
# that sort is never killed by SIGPIPE, which pipefail would report as a failure.
"$HADOOPCMD" fs -cat "$OUTPUT_DIR/*" \
  | /usr/bin/awk -v day="$LOGDATE" '{sum[$1]=$2}END{for(key in sum) print key"\t"sum[key]"\t"day}' \
  | sort -k2,2 -n -r \
  | /usr/bin/awk 'NR <= 10' \
  > "stattop.$LOGDATE"

mysql -uhive -phive -e "use test;
load data local infile '$MYPATH/statpv.$LOGDATE' into table statpv_00;
load data local infile '$MYPATH/stattop.$LOGDATE' into table stattop_00;"
在linux命令行里,先切换到/home/hadoop/app目录(即cgi-bin的上级目录),再输入python -m CGIHTTPServer
在浏览器里输入ip:8000/cgi-bin/show.py可以访问到前10行
python -m SimpleHTTPServer 8000
此命令会启动一个简单的HTTP服务(端口8000),可以通过浏览器访问、下载当前目录下的静态页面,是一个小例子;每访问一次,后台日志都会输出访问者的ip,可以用于统计用户数