Project description
We analyze the Apache common logs of the 黑马 (Heima) technology forum to compute the forum's key operational metrics and support the operators' decision making.
Data
Each log record consists of 5 parts:
1. Client IP
2. Access time
3. Requested resource
4. Response status code
5. Traffic (bytes) of the request
more access_2013_05_30.log
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /data/cache/style_1_widthauto.css?y7a HTTP/1.1" 200 1292
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_1.gif HTTP/1.1" 200 680
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/hot_2.gif HTTP/1.1" 200 682
27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/filetype/common.gif HTTP/1.1" 200 90
110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /source/plugin/wsh_wx/img/wsh_zk.css HTTP/1.1" 200 1482
110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /data/cache/style_1_forum_index.css?y7a HTTP/1.1" 200 2331
110.52.250.126 - - [30/May/2013:17:38:20 +0800] "GET /source/plugin/wsh_wx/img/wx_jqr.gif HTTP/1.1" 200 1770
Key metrics
- Page views (PV)
Definition: PV (Page View) is the total number of pages viewed by all users; every page a user opens is counted once.
Analysis: the total page-view count measures user interest in the site, much like ratings do for a TV series. For the operators, though, the page views of each individual board matter even more.
Formula: count of log records.
- Registered users
Formula: count of requests whose URL contains member.php?mod=register.
- Unique IPs
Definition: the number of distinct IP addresses that visit the site within one day; the same IP counts as one no matter how many pages it opens.
Analysis: this is the most familiar metric. Regardless of how many machines or users sit behind one IP, the number of unique IPs is, to some extent, the most direct measure of how well a promotion campaign performed.
Formula: count of distinct IPs.
- Bounce rate
Definition: the percentage of visits that view only one page before leaving the site, i.e. single-page visits divided by total visits.
Formula: (1) count the IPs that appear in only one record during the day, called the bounce count; (2) bounce count / PV.
- Board popularity ranking
Definition: ranking of the forum boards by number of visits.
Analysis: helps consolidate the strength of hot boards and improve the quiet ones, and also informs how the site's sections are developed.
Formula: sort the boards by visit count.
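Before building the full pipeline, the first three metrics can be sanity-checked directly against a raw log file from the command line; a minimal sketch, assuming the log sits at /apache_logs/access_2013_05_30.log:
# total record count (before filtering static resources, so higher than the cleaned PV)
wc -l < /apache_logs/access_2013_05_30.log
# hits on the registration page, approximating the registered-user count
grep -c 'member.php?mod=register' /apache_logs/access_2013_05_30.log
# distinct client IPs
awk '{print $1}' /apache_logs/access_2013_05_30.log | sort -u | wc -l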
Development steps
1. Import the log data into HDFS with the HDFS shell
2. Clean the data with MapReduce
3. Run multidimensional analysis on the cleaned data with Hive
4. Export the Hive results to MySQL with Sqoop
5. Use scripts to run the whole pipeline on a schedule
1. Upload the data
hadoop fs -put /apache_logs/* /hmbbs_logs
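The put above expects /hmbbs_logs to exist on HDFS; if it does not, create it first and then verify the upload, e.g.:
hadoop fs -mkdir /hmbbs_logs
hadoop fs -ls /hmbbs_logs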
2. Clean the raw data in HDFS with MapReduce, filter it, and write the result back to HDFS. The key filtering logic in the Mapper is shown first, followed by the full source.
//filter out requests for static resources
if(parsed[2].startsWith("GET /static/")||parsed[2].startsWith("GET /uc_server")){
return;
}
//strip the leading request-method prefix ("GET /" or "POST /")
if(parsed[2].startsWith("GET /")){
parsed[2] = parsed[2].substring("GET /".length());
}
else if(parsed[2].startsWith("POST /")){
parsed[2] = parsed[2].substring("POST /".length());
}
//strip the trailing "HTTP/1.1"
if(parsed[2].endsWith("HTTP/1.1")){
parsed[2] = parsed[2].substring(0, parsed[2].length()-"HTTP/1.1".length());
}
package hmbbs;
import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class HmbbsCleaner extends Configured implements Tool{
// static final String INPUT_PATH = "hdfs://chaoren:9000/hmbbs_logs/access_2013_05_30.log";
// static final String OUT_PATH = "hdfs://chaoren:9000/hmbbs_cleaned";
@Override
public int run(String[] args) throws Exception {
final Job job = new Job(new Configuration(), HmbbsCleaner.class.getSimpleName());
job.setJarByClass(HmbbsCleaner.class);
//1.1 specify the input path
FileInputFormat.setInputPaths(job, args[0]);
//specify the class used to parse the input file
job.setInputFormatClass(TextInputFormat.class);
//1.2 specify the custom Mapper class
job.setMapperClass(MyMapper.class);
//specify the map output <k2,v2> types
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
//2.2 specify the custom Reducer class
job.setReducerClass(MyReducer.class);
//specify the final output <k3,v3> types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
//2.3 specify where the output goes
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//submit the job to the JobTracker and wait for it to finish
job.waitForCompletion(true);
return 0;
}
public static void main(String[] args) throws Exception{
ToolRunner.run(new HmbbsCleaner(), args);
}
static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text>{
LogParser logParser = new LogParser();
Text v2 = new Text();
protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, LongWritable, Text>.Context context)
throws IOException, InterruptedException {
// note: the log format looks like this
//27.19.74.143 - - [30/May/2013:17:38:20 +0800] "GET /static/image/common/faq.gif HTTP/1.1" 200 1127
String[] parsed = logParser.parse(value.toString());
//filter out requests for static resources
if(parsed[2].startsWith("GET /static/")||parsed[2].startsWith("GET /uc_server")){
return;
}
//strip the leading request-method prefix ("GET /" or "POST /")
if(parsed[2].startsWith("GET /")){
parsed[2] = parsed[2].substring("GET /".length());
}
else if(parsed[2].startsWith("POST /")){
parsed[2] = parsed[2].substring("POST /".length());
}
//strip the trailing "HTTP/1.1"
if(parsed[2].endsWith("HTTP/1.1")){
parsed[2] = parsed[2].substring(0, parsed[2].length()-"HTTP/1.1".length());
}
v2.set(parsed[0]+"\t"+parsed[1]+"\t"+parsed[2]+"\t");
context.write(key, v2);
}
}
static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable>{
@Override
protected void reduce(LongWritable k2, Iterable<Text> v2s,
Reducer<LongWritable, Text, Text, NullWritable>.Context context) throws IOException, InterruptedException {
for (Text v2 : v2s) {
context.write(v2, NullWritable.get());
}
}
}
}
class LogParser {
public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
SimpleDateFormat dateformat1=new SimpleDateFormat("yyyyMMddHHmmss");//added: output format for the cleaned time field
public static void main(String[] args) throws ParseException {
final String S1 = "27.19.74.143 - - [30/May/2013:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
LogParser parser = new LogParser();
parser.parseDateFormat("30/May/2013:17:38:20");
final String[] array = parser.parse(S1);
System.out.println("样例数据: "+S1);
System.out.format("解析结果: ip=%s, time=%s, url=%s, status=%s, traffic=%s", array[0],array[1], array[2], array[3], array[4]);
// parser.parseDateFormat(array[1]).toLocaleString()被替换为array[1]
}
/**
* Parse the English-format time string, e.g. 30/May/2013:17:38:20
* @param string
* @return the parsed Date, or null if parsing fails
*/
private Date parseDateFormat(String string){
Date parse = null;
try {
parse = FORMAT.parse(string);
} catch (ParseException e) {
e.printStackTrace();
}
return parse;
}
/**
* Parse one line of the log
* @param line
* @return an array of 5 elements: ip, time, url, status, traffic
*/
public String[] parse(String line){
String ip = parseIP(line);
String time = parseTime(line);
String url = parseURL(line);
String status = parseStatus(line);
String traffic = parseTraffic(line);
return new String[]{ip, time ,url, status, traffic};
}
private String parseTraffic(String line) {
final String trim = line.substring(line.lastIndexOf("\"")+1).trim();
String traffic = trim.split(" ")[1];
return traffic;
}
private String parseStatus(String line) {
final String trim = line.substring(line.lastIndexOf("\"")+1).trim();
String status = trim.split(" ")[0];
return status;
}
private String parseURL(String line) {
final int first = line.indexOf("\"");
final int last = line.lastIndexOf("\"");
String url = line.substring(first+1, last);
return url;
}
private String parseTime(String line) {
final int first = line.indexOf("[");
final int last = line.indexOf("+0800]");
String time = line.substring(first+1,last).trim();
Date date = parseDateFormat(time); //added: parse the raw time string
return dateformat1.format(date);//added: re-format it as yyyyMMddHHmmss
}
private String parseIP(String line) {
String ip = line.split("- -")[0].trim();
return ip;
}
}
Upload the project jar to the server
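The cleaner class above first has to be packaged as cleaned.jar. A minimal command-line sketch, assuming a Hadoop 1.x installation under $HADOOP_HOME and that the jar's manifest entry point is set to hmbbs.HmbbsCleaner (the scripts below run hadoop jar cleaned.jar <in> <out> without naming a class, so a manifest Main-Class is required); exporting a runnable jar from an IDE works just as well:
# compile against the Hadoop core jar and package with an entry point
mkdir -p classes
javac -classpath "$HADOOP_HOME"/hadoop-core-*.jar -d classes HmbbsCleaner.java
jar cfe cleaned.jar hmbbs.HmbbsCleaner -C classes .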
If the data upload above has not been done yet, the following script performs both the upload and the cleaning step. Note that the script must be made executable first: chmod u+x upload_to_hdfs.sh
vi /apache_logs/upload_to_hdfs.sh
#!/bin/sh
#upload logs to hdfs
#get yesterday format string
#yesterday=`date --date='1 days ago' +%Y_%m_%d`
yesterday=$1 #$1 is the first argument passed when this script is run
hadoop fs -put /apache_logs/access_${yesterday}.log /hmbbs_logs
hadoop jar /apache_logs/cleaned.jar /hmbbs_logs/access_${yesterday}.log /hmbbs_cleaned/${yesterday}
Run the script:
upload_to_hdfs.sh 2013_05_30
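A quick check that both the upload and the cleaning job succeeded; the cleaned output should sit under the date-named directory:
hadoop fs -ls /hmbbs_logs
hadoop fs -ls /hmbbs_cleaned/2013_05_30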
3. Multidimensional analysis with Hive
Create an external partitioned table:
CREATE EXTERNAL TABLE hmbbs(ip string, atime string, url string) PARTITIONED BY (logdate string) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/hmbbs_cleaned';
Add a partition for the day:
ALTER TABLE hmbbs ADD PARTITION(logdate='2013_05_30') LOCATION '/hmbbs_cleaned/2013_05_30';
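The registered partitions can be verified with SHOW PARTITIONS, e.g. from the shell:
hive -e "SHOW PARTITIONS hmbbs;"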
Compute the daily PV:
CREATE TABLE hmbbs_pv_2013_05_30 AS SELECT COUNT(1) AS PV FROM hmbbs WHERE logdate='2013_05_30';
Compute the daily number of registered users:
CREATE TABLE hmbbs_reguser_2013_05_30 AS SELECT COUNT(1) AS REGUSER FROM hmbbs WHERE logdate='2013_05_30' AND INSTR(url,'member.php?mod=register')>0;
Compute the daily number of unique IPs:
CREATE TABLE hmbbs_ip_2013_05_30 AS SELECT COUNT(DISTINCT ip) AS IP FROM hmbbs WHERE logdate='2013_05_30';
Compute the daily number of bounce users:
CREATE TABLE hmbbs_jumper_2013_05_30 AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM hmbbs WHERE logdate='2013_05_30' GROUP BY ip HAVING times=1) e;
Merge the results of the four tables above into a single per-day table (the four intermediate tables can then be dropped):
CREATE TABLE hmbbs_2013_05_30 AS SELECT '2013_05_30', a.pv, b.reguser, c.ip, d.jumper FROM hmbbs_pv_2013_05_30 a JOIN hmbbs_reguser_2013_05_30 b ON 1=1 JOIN hmbbs_ip_2013_05_30 c ON 1=1 JOIN hmbbs_jumper_2013_05_30 d ON 1=1 ;
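Once the merged table exists, the four intermediate tables can be dropped, e.g. from the shell (the core script further below does the same):
hive -e "DROP TABLE hmbbs_pv_2013_05_30;"
hive -e "DROP TABLE hmbbs_reguser_2013_05_30;"
hive -e "DROP TABLE hmbbs_ip_2013_05_30;"
hive -e "DROP TABLE hmbbs_jumper_2013_05_30;"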
4. Export the data to MySQL with Sqoop
First create a table in MySQL:
mysql>use hmbbs;
mysql> create table hmbbs_logs_stat(logdate varchar(24), pv int, reguser int, ip int, jumper int);
Then export with Sqoop:
sqoop export --connect jdbc:mysql://chaoren:3306/hmbbs --username root --password admin --table hmbbs_logs_stat --fields-terminated-by '\001' --export-dir '/user/hive/warehouse/hmbbs_2013_05_30'
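To verify the export, the table can be queried from the MySQL client, using the same credentials as the sqoop command above:
mysql -uroot -padmin -e "SELECT * FROM hmbbs.hmbbs_logs_stat;"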
5. Automate the above tasks with scripts
Initialization script
When the system first goes live, run the initialization script hmbbs_init.sh once. It cleans all the log data from the days before the project was deployed by calling the core task script hmbbs_core.sh in a loop, so that each past day's cleaned data ends up in its own Hive partition directory.
vi hmbbs_init.sh
#!/bin/sh
s1=`date --date="$1" +%s` #$1 is the start date of the historical log data, passed in when running this init script
s2=`date +%s`
s3=$((($s2-$s1)/3600/24)) #number of days from $s1 to today
for ((i=$s3; i>0; i--)) #loop over each past day, cleaning its log and exporting the results to mysql
do
tmp=`date --date="$i days ago" +%Y_%m_%d`
hmbbs_core.sh $tmp
done
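A usage sketch for the init script. The argument is fed to date --date, so it should be a date string GNU date understands, e.g. 2013-05-01 rather than the underscore form used in the log file names; the start date below is only an example:
chmod u+x hmbbs_init.sh hmbbs_core.sh
./hmbbs_init.sh 2013-05-01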
The core task script:
vi hmbbs_core.sh
#!/bin/sh
#get yesterday format string
#yesterday=`date --date='1 days ago' +%Y_%m_%d`
yesterday=$1
#upload logs to hdfs
hadoop fs -put /apache_logs/access_${yesterday}.log /hmbbs_logs
#cleaning data
hadoop jar /apache_logs/cleaned.jar /hmbbs_logs/access_${yesterday}.log /hmbbs_cleaned/${yesterday} 1>/dev/null
#add the day's partition to the existing hive table
hive -e "ALTER TABLE hmbbs ADD PARTITION(logdate='${yesterday}') LOCATION '/hmbbs_cleaned/${yesterday}';"
#create the per-day hive tables
hive -e "CREATE TABLE hmbbs_pv_${yesterday} AS SELECT COUNT(1) AS PV FROM hmbbs WHERE logdate='${yesterday}';"
hive -e "CREATE TABLE hmbbs_reguser_${yesterday} AS SELECT COUNT(1) AS REGUSER FROM hmbbs WHERE logdate='${yesterday}' AND INSTR(url,'member.php?mod=register')>0;"
hive -e "CREATE TABLE hmbbs_ip_${yesterday} AS SELECT COUNT(DISTINCT ip) AS IP FROM hmbbs WHERE logdate='${yesterday}';"
hive -e "CREATE TABLE hmbbs_jumper_${yesterday} AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM hmbbs WHERE logdate='${yesterday}' GROUP BY ip HAVING times=1) e;"
hive -e "CREATE TABLE hmbbs_${yesterday} AS SELECT '${yesterday}', a.pv, b.reguser, c.ip, d.jumper FROM hmbbs_pv_${yesterday} a JOIN hmbbs_reguser_${yesterday} b ON 1=1 JOIN hmbbs_ip_${yesterday} c ON 1=1 JOIN hmbbs_jumper_${yesterday} d ON 1=1;"
#delete hive tables
hive -e "drop table hmbbs_pv_${yesterday};"
hive -e "drop table hmbbs_reguser_${yesterday};"
hive -e "drop table hmbbs_ip_${yesterday};"
hive -e "drop table hmbbs_jumper_${yesterday};"
#sqoop export to mysql
sqoop export --connect jdbc:mysql://chaoren:3306/hmbbs --username root --password admin --table hmbbs_logs_stat --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/hmbbs_${yesterday}"
#delete hive tables
hive -e "drop table hmbbs_${yesterday};"
The daily script, which runs automatically every day once added to crontab:
vi hmbbs_daily.sh
#!/bin/sh
yesterday=`date --date='1 days ago' +%Y_%m_%d`
hmbbs_core.sh $yesterday
Add it to crontab so it runs on schedule:
crontab -e
0 1 * * * /apache_logs/hmbbs_daily.sh
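As with the upload script, the daily and core scripts need execute permission before cron can run them, and since hmbbs_daily.sh calls hmbbs_core.sh by its bare name, the core script should be on the PATH seen by cron or be invoked with its full path. A small sketch, assuming all the scripts live under /apache_logs:
chmod u+x /apache_logs/hmbbs_daily.sh /apache_logs/hmbbs_core.sh /apache_logs/hmbbs_init.sh
crontab -l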