Forum-Based Apache Common Log Analysis Project

The project is hosted on GitHub; stars, forks, and comments for mutual learning are welcome: Forum-Based Apache Common Log Analysis Project

Project Description

Analyze the Apache common logs of a technical forum to compute the forum's key metrics and support the operators' decision making.

Project Design

  • MapReduce jobs to compute the KPIs
  • HBase for detail-record queries
  • Hive data warehouse for multidimensional analysis

    Log report analysis

Development steps:

1. Use Flume to import the log data into HDFS

Technique: Flume (the source is a spooling directory, the sink is HDFS, and the channel is an in-memory buffer, as configured below)
flume-hdfs.conf

# Name the components on this agent
a1.sources = r1
a1.sinks = k1
a1.channels = c1

# Describe/configure the source
a1.sources.r1.type = spooldir
a1.sources.r1.spoolDir = /home/elon/log

a1.sources.r1.fileHeader = true

# Describe the sink
a1.sinks.k1.type = hdfs
a1.sinks.k1.hdfs.path = hdfs://hadoop:9000/log/
a1.sinks.k1.hdfs.filePrefix = access-%Y-%m-%d
a1.sinks.k1.hdfs.round = true
a1.sinks.k1.hdfs.roundValue = 10
a1.sinks.k1.hdfs.roundUnit = minute
a1.sinks.k1.hdfs.useLocalTimeStamp = true
# Use a channel which buffers events in memory
a1.channels.c1.type = memory
a1.channels.c1.capacity = 1000
a1.channels.c1.transactionCapacity = 100

# Bind the source and sink to the channel
a1.sources.r1.channels = c1
a1.sinks.k1.channel = c1
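
Once the configuration is in place, the agent can be started with the standard flume-ng command; a minimal sketch, assuming flume-hdfs.conf sits in the current directory and $FLUME_HOME/conf holds Flume's environment files:

flume-ng agent --name a1 --conf $FLUME_HOME/conf --conf-file flume-hdfs.conf -Dflume.root.logger=INFO,console

Any file dropped into /home/elon/log is then picked up by the spooling-directory source and written under hdfs://hadoop:9000/log/.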

2. Clean the data

Technique: MapReduce. The cleaner below parses each Apache common log line, drops requests for static resources, and keeps only the ip, time, and url fields:
package com.elon33.bbs;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Locale;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class bbsCleaner extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        final Job job = new Job(new Configuration(), bbsCleaner.class.getSimpleName());
        job.setJarByClass(bbsCleaner.class);
        FileInputFormat.setInputPaths(job, args[0]);
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.waitForCompletion(true);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new bbsCleaner(), args);
    }

    static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        LogParser logParser = new LogParser();
        Text v2 = new Text();

        protected void map(LongWritable key, Text value,
                org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                        throws java.io.IOException, InterruptedException {
            final String[] parsed = logParser.parse(value.toString());

            // Filter out requests for static resources
            if (parsed[2].startsWith("GET /static/") || parsed[2].startsWith("GET /uc_server")) {
                return;
            }

            // Strip the leading "GET /" or "POST /" prefix
            if (parsed[2].startsWith("GET /")) {
                parsed[2] = parsed[2].substring("GET /".length());
            } else if (parsed[2].startsWith("POST /")) {
                parsed[2] = parsed[2].substring("POST /".length());
            }

            // Strip the trailing " HTTP/1.1" suffix
            if (parsed[2].endsWith(" HTTP/1.1")) {
                parsed[2] = parsed[2].substring(0, parsed[2].length() - " HTTP/1.1".length());
            }

            v2.set(parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]);
            context.write(key, v2);
        };
    }

    static class MyReducer extends Reducer<LongWritable, Text, Text, NullWritable> {
        protected void reduce(LongWritable k2, java.lang.Iterable<Text> v2s,
                org.apache.hadoop.mapreduce.Reducer<LongWritable, Text, Text, NullWritable>.Context context)
                        throws java.io.IOException, InterruptedException {
            for (Text v2 : v2s) {
                context.write(v2, NullWritable.get());
            }
        };
    }

    static class LogParser {
        public static final SimpleDateFormat FORMAT = new SimpleDateFormat("d/MMM/yyyy:HH:mm:ss", Locale.ENGLISH);
        public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyyMMddHHmmss");

        public static void main(String[] args) throws ParseException {
            final String S1 = "27.19.74.143 - - [30/May/2013:17:38:20 +0800] \"GET /static/image/common/faq.gif HTTP/1.1\" 200 1127";
            LogParser parser = new LogParser();
            final String[] array = parser.parse(S1);
            System.out.println("Sample record: " + S1);
            System.out.format("Parsed result: ip=%s, time=%s, url=%s, status=%s, traffic=%s", array[0], array[1], array[2],
                    array[3], array[4]);
        }

        /**
         * Parse the English-locale timestamp taken from a log line.
         * 
         * @param string raw timestamp, e.g. 30/May/2013:17:38:20
         * @return the parsed Date, or null if parsing fails
         */
        private Date parseDateFormat(String string) {
            Date parse = null;
            try {
                parse = FORMAT.parse(string);
            } catch (ParseException e) {
                e.printStackTrace();
            }
            return parse;
        }

        /**
         * Parse one log line.
         * 
         * @param line a raw Apache common log record
         * @return an array of 5 elements: ip, time, url, status, traffic
         */
        public String[] parse(String line) {
            String ip = parseIP(line);
            String time = parseTime(line);
            String url = parseURL(line);
            String status = parseStatus(line);
            String traffic = parseTraffic(line);

            return new String[] { ip, time, url, status, traffic };
        }

        private String parseTraffic(String line) {
            final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
            String traffic = trim.split(" ")[1];
            return traffic;
        }

        private String parseStatus(String line) {
            final String trim = line.substring(line.lastIndexOf("\"") + 1).trim();
            String status = trim.split(" ")[0];
            return status;
        }

        private String parseURL(String line) {
            final int first = line.indexOf("\"");
            final int last = line.lastIndexOf("\"");
            String url = line.substring(first + 1, last);
            return url;
        }

        private String parseTime(String line) {
            final int first = line.indexOf("[");
            final int last = line.indexOf("+0800]");
            String time = line.substring(first + 1, last).trim();
            Date date = parseDateFormat(time);
            return dateformat1.format(date);
        }

        private String parseIP(String line) {
            String ip = line.split("- -")[0].trim();
            return ip;
        }
    }

}
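
A minimal sketch of running the cleaner, assuming the job is packaged as cleaned.jar (the jar name and input path are assumptions mirroring bbs_common.sh in step 6; the output path matches the result shown below):

hadoop jar cleaned.jar com.elon33.bbs.bbsCleaner /bbs_logs/access_2013_05_30.log /user/elon/bbs_cleaned/2013_05_30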

Data cleaning result

hadoop fs -cat /user/elon/bbs_cleaned/2013_05_30/part-r-00000

3. Store the detail logs in HBase, queryable by IP and time

Technique: table design and region pre-splitting
Create the bbs_log table in HBase with a single column family cf (a pre-split variant is sketched below):
hbase> create 'bbs_log','cf'
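
Since every row key begins with the client IP, the table can optionally be pre-split so the import load is spread over several regions from the start; a minimal sketch with hypothetical split points on the first octet of the IP:

hbase> create 'bbs_log','cf', SPLITS => ['100','150','200','50']

The MapReduce job below then batch-imports the cleaned records into bbs_log: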
package com.elon33.bbs;

import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat;
import org.apache.hadoop.hbase.mapreduce.TableReducer;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;


public class bbsHBase extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        final Configuration configuration = new Configuration();
        // ZooKeeper quorum of the HBase cluster
        configuration.set("hbase.zookeeper.quorum", "hadoop");
        // Target HBase table (must already exist; created in the hbase shell above)
        configuration.set(TableOutputFormat.OUTPUT_TABLE, "bbs_log");
        // Raise the timeout so the bulk import does not fail with a DFS socket timeout
        configuration.set("dfs.socket.timeout", "180000");

        final Job job = new Job(configuration, "bbsHBaseBatchImport");
        job.setJarByClass(bbsHBase.class);  // needed when the job runs from a packaged jar
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);

        // Set the map output types; the reducer emits Puts, so no reduce output types are set
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(Text.class);
        job.setInputFormatClass(TextInputFormat.class);

        // No output path is set; TableOutputFormat writes directly into HBase
        job.setOutputFormatClass(TableOutputFormat.class);

        FileInputFormat.setInputPaths(job, args[0]);  // input is the cleaned output from step 2
        job.waitForCompletion(true);
        return 0;
    }

    public static void main(String[] args) throws Exception {
        ToolRunner.run(new bbsHBase(), args);
    }

    static class MyMapper extends Mapper<LongWritable, Text, LongWritable, Text> {
        Text v2 = new Text();
        public static final SimpleDateFormat dateformat = new SimpleDateFormat("yyyyMMddHHmmss");
        public static final SimpleDateFormat dateformat1 = new SimpleDateFormat("yyyyMMdd");

        protected void map(LongWritable key, Text value,
                org.apache.hadoop.mapreduce.Mapper<LongWritable, Text, LongWritable, Text>.Context context)
                        throws java.io.IOException, InterruptedException {
            final String[] parsed = value.toString().split("\t");
            if(parsed.length==3){
                Date parseDate = null;
                String time1 = "";
                try {
                    parseDate = dateformat.parse(parsed[1]);
                    time1 = dateformat1.format(parseDate);
                } catch (ParseException e) {
                    e.printStackTrace();
                }
                String rowKey = parsed[0] + ":" + time1;// row key: ip + date (time truncated to the day)
                v2.set(rowKey + "\t" + parsed[0] + "\t" + parsed[1] + "\t" + parsed[2]); // rowkey  ip  time  url
                context.write(key, v2);
            }else{
                return;
            }
        };
    }

    // Write each record into HBase under its row key
    static class MyReducer extends TableReducer<LongWritable, Text, NullWritable> {
        protected void reduce(LongWritable k2, java.lang.Iterable<Text> v2s, Context context)
                throws java.io.IOException, InterruptedException {
            for (Text v2 : v2s) {
                final String[] splited = v2.toString().split("\t");
                final Put put = new Put(Bytes.toBytes(splited[0])); // field 1: row key (ip:date)
                put.add(Bytes.toBytes("cf"), Bytes.toBytes("date"), Bytes.toBytes(splited[1])); // field 2: IP (stored under qualifier "date")
                put.add(Bytes.toBytes("cf"), Bytes.toBytes("time"), Bytes.toBytes(splited[2])); // field 3: time
                put.add(Bytes.toBytes("cf"), Bytes.toBytes("url"), Bytes.toBytes(splited[3])); // field 4: url
                context.write(NullWritable.get(), put);
            }
        };
    }
}

Contents stored in the bbs_log table in HBase
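
Because the row key is ip:yyyyMMdd, a detail lookup by IP and day is a simple row-key range scan; a sketch in the hbase shell, where the IP and date are placeholder values:

hbase> scan 'bbs_log', {STARTROW => '27.19.74.143:20130530', STOPROW => '27.19.74.143:20130531'}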

4. Use Hive for multidimensional analysis of the data

Technique: Hive (tables, views) and user-defined functions
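
The statements below assume that a partitioned table bbs has already been created over the cleaned records; a minimal sketch of its definition, where the column list (ip, logtime, url) follows the three tab-separated fields written by the cleaner and logtime is a hypothetical name:

hive -e "CREATE EXTERNAL TABLE bbs(ip STRING, logtime STRING, url STRING) PARTITIONED BY (logdate STRING) ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t' LOCATION '/user/elon/bbs_cleaned';"
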
# Add the day's cleaned data as a partition of the main partitioned table
hive -e "ALTER TABLE bbs ADD PARTITION(logdate='2013_05_30') LOCATION 'hdfs://hadoop:9000/user/elon/bbs_cleaned/2013_05_30';"
# create the per-day statistics tables

## Daily page views (PV)
hive -e "CREATE TABLE bbs_pv_2013_05_30 AS SELECT COUNT(1) AS PV FROM bbs WHERE logdate='2013_05_30';"

## Daily registrations
hive -e "CREATE TABLE bbs_reguser_2013_05_30 AS SELECT COUNT(1) AS REGUSER FROM bbs WHERE logdate='2013_05_30' AND INSTR(url,'member.php?mod=register')>0;"

## Daily distinct visitor IPs
hive -e "CREATE TABLE bbs_ip_2013_05_30 AS SELECT COUNT(DISTINCT ip) AS IP FROM bbs WHERE logdate='2013_05_30';"

## Daily bounces (IPs with exactly one request)
hive -e "CREATE TABLE bbs_jumper_2013_05_30 AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM bbs WHERE logdate='2013_05_30' GROUP BY ip HAVING times=1) e;"

## Combine the four results into one summary table (each source table holds a single row, so the 1=1 joins yield exactly one summary row)
hive -e "CREATE TABLE bbs_2013_05_30 AS SELECT '2013_05_30', a.pv, b.reguser, c.ip, d.jumper FROM bbs_pv_2013_05_30 a JOIN bbs_reguser_2013_05_30 b ON 1=1 JOIN bbs_ip_2013_05_30 c ON 1=1 JOIN bbs_jumper_2013_05_30 d ON 1=1;"

Summary table result

5. Export the Hive results to MySQL with Sqoop

Technique: Sqoop, MySQL

Create the target table in MySQL (see the sketch below), then run the export:
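
A minimal sketch of the target schema, assuming the bbs database from the connection string and five columns matching the summary row; the column names and types are assumptions:

CREATE DATABASE IF NOT EXISTS bbs;
USE bbs;
CREATE TABLE bbs_logs (
    logdate VARCHAR(10),
    pv      BIGINT,
    reguser BIGINT,
    ip      BIGINT,
    jumper  BIGINT
);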

sqoop export --connect jdbc:mysql://hadoop:3306/bbs --username root --password 123456 --table bbs_logs --fields-terminated-by '\001' --export-dir 'hdfs://hadoop:9000/user/hive/warehouse/bbs_2013_05_30'

Once the final forum metrics have been exported to MySQL, the intermediate Hive tables are no longer needed. Dropping them is handled in the automated schedule below.

6. Finally, schedule everything with Linux crontab

To run the log analysis automatically every day, all of the commands are wrapped in shell scripts; the date is derived for each run and a cron entry triggers the scripts, so the analysis proceeds without manual intervention.

Set up the scheduled job in crontab -e

0 1 * * * bbs_daily.sh

bbs_daily.sh computes yesterday's date and hands it to the common script
bbs_daily.sh

#!/bin/sh

yesterday=`date --date='1 days ago' +%Y_%m_%d`
bbs_common.sh $yesterday

bbs_common.sh contains the reusable commands that run the full analysis for the given day and produce the metric results
bbs_common.sh

#!/bin/sh

#get yesterday format string
#yesterday=`date --date='1 days ago' +%Y_%m_%d`
yesterday=$1

#upload logs to hdfs
hadoop fs -put /apache_logs/access_${yesterday}.log  /bbs_logs

#cleaning data
hadoop jar /apache_logs/cleaned.jar  /bbs_logs/access_${yesterday}.log  /bbs_cleaned/${yesterday}  1>/dev/null


#alter hive table and then add partition to existing table
hive -e "ALTER TABLE bbs ADD PARTITION(logdate='${yesterday}') LOCATION '/bbs_cleaned/${yesterday}';"

#create the daily hive tables
hive -e "CREATE TABLE bbs_pv_${yesterday} AS SELECT COUNT(1) AS PV FROM bbs WHERE logdate='${yesterday}';"
hive -e "CREATE TABLE bbs_reguser_${yesterday} AS SELECT COUNT(1) AS REGUSER FROM bbs WHERE logdate='${yesterday}' AND INSTR(url,'member.php?mod=register')>0;"
hive -e "CREATE TABLE bbs_ip_${yesterday} AS SELECT COUNT(DISTINCT ip) AS IP FROM bbs WHERE logdate='${yesterday}';"
hive -e "CREATE TABLE bbs_jumper_${yesterday} AS SELECT COUNT(1) AS jumper FROM (SELECT COUNT(ip) AS times FROM bbs WHERE logdate='${yesterday}' GROUP BY ip HAVING times=1) e;"
hive -e "CREATE TABLE bbs_${yesterday} AS SELECT '${yesterday}', a.pv, b.reguser, c.ip, d.jumper FROM bbs_pv_${yesterday} a JOIN bbs_reguser_${yesterday} b ON 1=1 JOIN bbs_ip_${yesterday} c ON 1=1 JOIN bbs_jumper_${yesterday} d ON 1=1;"

#delete hive tables
hive -e "drop table bbs_pv_${yesterday};"
hive -e "drop table bbs_reguser_${yesterday};"
hive -e "drop table bbs_ip_${yesterday};"
hive -e "drop table bbs_jumper_${yesterday};"


#sqoop export to mysql
sqoop export --connect jdbc:mysql://hadoop0:3306/bbs --username root --password admin --table bbs_logs --fields-terminated-by '\001' --export-dir "/user/hive/warehouse/bbs_${yesterday}"

#delete hive tables
hive -e "drop table bbs_${yesterday};"