日志流量分析

ETL的实现

 

ETL:Extract-Transform-Load,数据抽取-转换-加载过程。

■目 标:过滤无效数据,解析补全数据,格式化需求数据
■无效数据:缺少访客id、会话id、订单id等关键属性的数据;针对不同事件有不同的属性要求
■解析补全数据: 浏览器信息、操作系统信息、地域信息等

■格式化数据:时间日期、客户端信息等
■数据来源: 存储在HDFS上的用户行为数据
■数据存储: ETL后的数据存储在HDFS上,为后期使用Hive分析做好准备

 

数据清洗日志内容

ETL代码实现

 

WebLogDriver:

/**
 * Driver for the web-log ETL job: reads raw access-log lines from the input
 * path, runs {@link WebLogMapper} over them and writes the cleaned records to
 * the output path.
 */
public class WebLogDriver extends Configured implements Tool {
    /**
     * Configures and runs the ETL job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println( "Usage: WebLogDriver <input path> <output path>" );
            return 2;
        }

        // 1. Create the job.
        Job job = Job.getInstance( this.getConf(), "WebLogDriver" );
        job.setJarByClass( WebLogDriver.class );

        // 2. Configure the job.
        // (2.1) input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath );

        // (2.2) map
        job.setMapperClass( WebLogMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( NullWritable.class );
        // No reducer class is configured; declare the job output types so
        // they agree with what the mapper emits.
        job.setOutputKeyClass( Text.class );
        job.setOutputValueClass( NullWritable.class );

        // (2.5) output: remove a pre-existing output directory so the job can
        // be re-run. The file system is resolved from the path itself so a
        // fully-qualified URI on a non-default file system also works.
        Path outputPath = new Path( args[1] );
        FileSystem fs = outputPath.getFileSystem( this.getConf() );
        if (fs.exists( outputPath )) {
            fs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );

        // 3. Submit and wait.
        return job.waitForCompletion( true ) ? 0 : 1;
    }

    /**
     * Command-line entry point. Exits with the status returned by
     * {@link #run(String[])}, or with 1 if an exception escapes.
     */
    public static void main(String[] args) {
        int status;
        try {
            status = ToolRunner.run( new Configuration(), new WebLogDriver(), args );
        } catch (Exception e) {
            e.printStackTrace();
            // A failure must not be reported to the shell as success.
            status = 1;
        }
        System.exit( status );
    }
}

WebLogMapper:

/**
 * Map-side ETL step: parses each raw log line into a {@link WebLogBean},
 * flags requests whose URL is not in the page whitelist, and emits the bean's
 * text form as the key with a {@code NullWritable} value.
 */
public class WebLogMapper extends Mapper<LongWritable, Text,Text, NullWritable> {
    LogParseUtil logParseUtil =  new LogParseUtil();
    Set<String> page = new HashSet<String>(  );
    private Text outputKey = new Text(  );

    /** Fills the whitelist of page URLs before any record is mapped. */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        String[] knownPages = {
                "/about",
                "/black-ip-list/",
                "/cassandra-clustor/",
                "/finance-rhive-repurchase/",
                "/hadoop-family-roadmap/",
                "/hadoop-hive-intro/",
                "/hadoop-zookeeper-intro/",
                "/hadoop-mahout-roadmap/"
        };
        for (String knownPage : knownPages) {
            page.add( knownPage );
        }
    }

    /**
     * Parses one log line, applies the page filter and writes the result.
     * Every record is written, whether or not it ended up flagged valid.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        WebLogBean parsed = logParseUtil.logParseHandle( value.toString() );
        logParseUtil.pageFilter( parsed, page );
        outputKey.set( parsed.toString() );
        context.write( outputKey, NullWritable.get() );
    }
}

LogParseUtil:

/**
 * Parses space-separated access-log lines into {@link WebLogBean} objects and
 * applies the page whitelist filter.
 *
 * <p>Instances hold {@link SimpleDateFormat} objects, which are not
 * thread-safe; use one instance per thread.
 */
public class LogParseUtil {
    /** Timestamp layout in the raw log, e.g. {@code 18/Sep/2013:13:47:35}. */
    final SimpleDateFormat df1 = new SimpleDateFormat( "dd/MMM/yyyy:HH:mm:ss",Locale.ENGLISH);
    /** Timestamp layout written to the cleaned output, e.g. {@code 2013-09-18 13:47:35}. */
    final SimpleDateFormat df2 = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss", Locale.ENGLISH);

    /**
     * Parses one raw log line.
     *
     * <p>The returned bean is flagged invalid when the line has fewer than 12
     * space-separated tokens, the client IP is blank, the timestamp cannot be
     * parsed, or the HTTP status is not a number or is 400 or above.
     *
     * @param line raw log line
     * @return a bean that is never {@code null}; check {@code isValid()}
     */
    public WebLogBean logParseHandle(String line){
        WebLogBean webLogBean = new WebLogBean();
        if (line == null) {
            webLogBean.setValid( false );
            return webLogBean;
        }
        String[] items = line.split( " " );
        if (items.length <= 11) {
            webLogBean.setValid( false );
            return webLogBean;
        }

        if (StringUtils.isNotBlank( items[0] )) {
            webLogBean.setIp( items[0] );
        } else {
            // Downstream jobs group by IP, so a record without one is unusable.
            webLogBean.setValid( false );
        }
        webLogBean.setUser_id( items[2] );

        // Reformat the timestamp. The first character of the token is
        // dropped before parsing; an empty token cannot be a timestamp.
        String s_time = items[3].isEmpty() ? null : dateDateFormat( items[3].substring( 1 ) );
        if (s_time == null) {
            s_time = "invalid_time";
            webLogBean.setValid( false );
        }
        webLogBean.setTime( s_time );

        // Request URL and response status.
        webLogBean.setRequest( items[6] );
        webLogBean.setStatus( items[8] );
        if (isErrorStatus( items[8] )) {
            webLogBean.setValid( false );
        }
        webLogBean.setBody_size( items[9] );
        webLogBean.setHttp_ref( items[10] );

        // The user agent contains spaces, so it spans every remaining token.
        if (items.length > 12) {
            StringBuilder sb = new StringBuilder();
            for (int i = 11; i < items.length; i++) {
                // The trailing space is kept so the output format is unchanged.
                sb.append( items[i] ).append( " " );
            }
            webLogBean.setUser_agent( sb.toString() );
        } else {
            webLogBean.setUser_agent( items[11] );
        }
        return webLogBean;
    }

    /**
     * Tells whether a status token marks a failed request.
     *
     * @param status raw status token from the log line
     * @return {@code true} if the token is not an integer or is 400 or above
     */
    private boolean isErrorStatus(String status) {
        try {
            return Integer.parseInt( status ) >= 400;
        } catch (NumberFormatException e) {
            // A non-numeric status (for example "-") means the line is malformed.
            return true;
        }
    }

    /**
     * Converts a raw-log timestamp to the output layout.
     *
     * @param substring timestamp text in the {@code df1} layout
     * @return the timestamp in the {@code df2} layout, or {@code null} if it cannot be parsed
     */
    private String dateDateFormat(String substring) {
        try {
            Date date = df1.parse( substring );
            return df2.format( date );
        } catch (ParseException ignored) {
            // The caller marks the record invalid; logging a stack trace for
            // every bad line would flood the task logs.
            return null;
        }
    }

    /**
     * Flags the bean invalid when its request URL is not in the whitelist.
     *
     * @param logBean bean to check; modified in place
     * @param page    set of accepted request URLs
     */
    public void pageFilter(WebLogBean logBean, Set<String> page) {
        if(!page.contains( logBean.getRequest() )){
            logBean.setValid( false );
        }
    }
}

WebLogBean:

public class WebLogBean implements Writable {
    private boolean valid = true;
    private String  ip ;
    private String  user_id;
    //请求的时间   2018-07-07 16:59:59
    private String  time;
    //请求的URL
    private String  request;
    //请求结果的状态
    private String  status;
    private String  body_size;
    //前页面
    private String  http_ref;
    //浏览器,操作系统等信息
    private String  user_agent;

    public WebLogBean() {
    }

    public WebLogBean(boolean valid, String ip, String user_id, String time, String request, String status, String body_size, String http_ref, String user_agent) {
        this.valid = valid;
        this.ip = ip;
        this.user_id = user_id;
        this.time = time;
        this.request = request;
        this.status = status;
        this.body_size = body_size;
        this.http_ref = http_ref;
        this.user_agent = user_agent;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean( valid );
        out.writeUTF( ip );
        out.writeUTF( user_id );
        out.writeUTF( time );
        out.writeUTF( request );
        out.writeUTF( status );
        out.writeUTF( body_size );
        out.writeUTF( http_ref );
        out.writeUTF( user_agent );
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.valid = in.readBoolean();
        this.ip = in.readUTF();
        this.user_id = in.readUTF();
        this.time = in.readUTF();
        this.request = in.readUTF();
        this.status = in.readUTF();
        this.body_size = in.readUTF();
        this.http_ref = in.readUTF();
        this.user_agent = in.readUTF();
    }


    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append( this.valid )
                .append( "\001" ).append( this.ip )
                .append( "\001" ).append( this.user_id )
                .append( "\001" ).append( this.time )
                .append( "\001" ).append( this.request )
                .append( "\001" ).append( this.status )
                .append( "\001" ).append( this.body_size )
                .append( "\001" ).append( this.http_ref )
                .append( "\001" ).append( this.user_agent );

        return sb.toString();
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getUser_id() {
        return user_id;
    }

    public void setUser_id(String user_id) {
        this.user_id = user_id;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_size() {
        return body_size;
    }

    public void setBody_size(String body_size) {
        this.body_size = body_size;
    }

    public String getHttp_ref() {
        return http_ref;
    }

    public void setHttp_ref(String http_ref) {
        this.http_ref = http_ref;
    }

    public String getUser_agent() {
        return user_agent;
    }

    public void setUser_agent(String user_agent) {
        this.user_agent = user_agent;
    }

打成jar包然后上传到hadoop运行:

bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.webetl.etl.WebLogDriver /datas/tmp/access.log /flume/weblog/etl

一条数据: 

false1.162.203.134-2013-09-18 13:47:35/images/my.jpg20019939"http://www.angularjs.cn/A0d9""Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36" 

 

PageView分析模型构建


■对清洗后数据针对每一个会话,进行进一步分析,构建pageview模型,在每一条记录
中添加session_id、停留时长、访问顺序等字段

步骤: 

Map:

key   value

ip 其他字段

reduce:

每一个IP调用一次Reduce方法

将所有的记录放到List

可以对所有的记录进行排序:访问时间排序

生成对应的字段

迭代整个List集合

如果list的长度为1,

直接输出结果

如果拿到的是第一条记录

i=0,跳过

循环到达第二个记录

i=1,

sessionId = UUID.randomUUID().toString();

step=1

length = list[i].getTime() - list[i-1].getTime()

判定:length >30分钟

sessionId重置

step=1

如果 i = list.length-1

输出上一条和当前条

output

 

PageViewDriver:

/**
 * Driver for the page-view job: reads the ETL output, groups records with
 * {@link PageViewMapper} and lets {@link PageViewReducer} emit one
 * {@link PVBean} per page view.
 */
public class PageViewDriver extends Configured implements Tool {
    /**
     * Configures and runs the page-view job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println( "Usage: PageViewDriver <input path> <output path>" );
            return 2;
        }

        // 1. Create the job.
        Job job = Job.getInstance( this.getConf(), "PageViewDriver" );
        job.setJarByClass( PageViewDriver.class );

        // 2. Configure the job.
        // (2.1) input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath );

        // (2.2) map
        job.setMapperClass( PageViewMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( WebLogBean.class );

        // (2.4) reduce
        job.setReducerClass( PageViewReducer.class );
        job.setOutputKeyClass( PVBean.class );
        job.setOutputValueClass( NullWritable.class );

        // (2.5) output: remove a pre-existing output directory so the job can
        // be re-run. The file system is resolved from the path itself so a
        // fully-qualified URI on a non-default file system also works.
        Path outputPath = new Path( args[1] );
        FileSystem fs = outputPath.getFileSystem( this.getConf() );
        if (fs.exists( outputPath )) {
            fs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );

        // 3. Submit and wait.
        return job.waitForCompletion( true ) ? 0 : 1;
    }

    /**
     * Command-line entry point. Exits with the status returned by
     * {@link #run(String[])}, or with 1 if an exception escapes.
     */
    public static void main(String[] args) {
        int status;
        try {
            status = ToolRunner.run( new Configuration(), new PageViewDriver(), args );
        } catch (Exception e) {
            e.printStackTrace();
            // A failure must not be reported to the shell as success.
            status = 1;
        }
        System.exit( status );
    }
}

PageViewMapper:

/**
 * Reads {@code \001}-separated ETL records, rebuilds a {@link WebLogBean}
 * from each one and emits the valid ones keyed by client IP. Counters record
 * how many records were seen, accepted and rejected.
 */
public class PageViewMapper extends Mapper<LongWritable, Text,Text, WebLogBean> {
    /** Number of fields in one ETL record (valid flag plus eight strings). */
    private static final int FIELD_COUNT = 9;

    private Text outputKey = new Text(  );
    private WebLogBean outputValue = new WebLogBean();

    /**
     * Maps one ETL record. Records with too few fields are counted as
     * invalid and skipped instead of failing the task.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws InterruptedException, IOException {
        // Limit -1 keeps trailing empty fields, so an empty last column still
        // yields the full number of fields.
        String[] items = value.toString().split( "\001", -1 );

        context.getCounter( "user define group ","ip count" ).increment( 1L );
        if (items.length < FIELD_COUNT) {
            context.getCounter( "user define group ","invalid count" ).increment( 1L );
            return;
        }

        outputValue.setAll( items[0].equals( "true" ),items[1],items[2],items[3],items[4],items[5],items[6],items[7],items[8]);
        outputKey.set( outputValue.getIp() );

        if(outputValue.isValid()){
            context.getCounter( "user define group ","valid count" ).increment( 1L );
            context.write( outputKey,outputValue );
        }else {
            context.getCounter( "user define group ","invalid count" ).increment( 1L );
        }
    }
}

PageViewReducer:

/**
 * Builds the page-view model for one client IP: orders the IP's requests by
 * time, splits them into sessions on gaps of 30 minutes or more, and emits
 * one {@link PVBean} per request carrying the session id, the step number
 * within the session and the stay time in seconds.
 */
public class PageViewReducer extends Reducer<Text, WebLogBean, PVBean, NullWritable> {
    /** A gap of at least this many seconds between two requests starts a new session. */
    private static final long SESSION_TIMEOUT_SECONDS = 30 * 60;
    /** Stay time used for the last page of a session, where no next request exists. */
    private static final String DEFAULT_STAY_SECONDS = "60";

    private  NullWritable outputValue = NullWritable.get();
    private SimpleDateFormat sdf = new SimpleDateFormat( "yyyy-MM-dd HH:mm:ss" );
    private PVBean pvBean = new PVBean();

    /**
     * Emits the page views for all requests of one IP.
     *
     * @param key     client IP
     * @param values  that IP's log records, in no particular order
     * @param context job context the page views are written to
     * @throws IllegalArgumentException if a record's time is not in
     *         {@code yyyy-MM-dd HH:mm:ss} form
     */
    @Override
    protected void reduce(Text key, Iterable<WebLogBean> values, Context context) throws IOException, InterruptedException {
        // Each value is copied field by field before being stored, as the
        // original did via BeanUtils; a failed copy can no longer be swallowed.
        ArrayList<WebLogBean> beans = new ArrayList<>();
        for (WebLogBean bean : values) {
            beans.add( copyOf( bean ) );
        }
        if (beans.isEmpty()) {
            return;
        }

        // Order the requests chronologically.
        Collections.sort( beans, new Comparator<WebLogBean>() {
            @Override
            public int compare(WebLogBean o1, WebLogBean o2) {
                return myToDate( o1.getTime() ).compareTo( myToDate( o2.getTime() ) );
            }
        } );

        String sessionId = UUID.randomUUID().toString();
        int step = 1;
        // Each iteration emits the PREVIOUS record, because its stay time is
        // only known once the next request's time is available.
        for (int i = 1; i < beans.size(); i++) {
            WebLogBean previous = beans.get( i - 1 );
            long gapSeconds = timeDiff( beans.get( i ).getTime(), previous.getTime() ) / 1000;
            if (gapSeconds < SESSION_TIMEOUT_SECONDS) {
                // Same session: stay time is the gap to the next request.
                emit( context, sessionId, previous, step, String.valueOf( gapSeconds ) );
                step++;
            } else {
                // The previous record closes its session. The gap to the next
                // session is not time spent on the page, so use the default.
                emit( context, sessionId, previous, step, DEFAULT_STAY_SECONDS );
                sessionId = UUID.randomUUID().toString();
                step = 1;
            }
        }
        // The chronologically last record always closes a session. This also
        // covers an IP with a single record.
        emit( context, sessionId, beans.get( beans.size() - 1 ), step, DEFAULT_STAY_SECONDS );
    }

    /** Writes one page view built from {@code b} and the given session data. */
    private void emit(Context context, String sessionId, WebLogBean b, int step, String staySeconds)
            throws IOException, InterruptedException {
        pvBean.setAll( sessionId,b.getIp(),b.getTime(),b.getRequest(),step,staySeconds,b.getHttp_ref(),
                b.getUser_agent(),b.getBody_size(),b.getStatus());
        context.write( pvBean,outputValue);
    }

    /** Returns an independent copy of {@code source}. */
    private WebLogBean copyOf(WebLogBean source) {
        WebLogBean copy = new WebLogBean();
        copy.setValid( source.isValid() );
        copy.setIp( source.getIp() );
        copy.setUser_id( source.getUser_id() );
        copy.setTime( source.getTime() );
        copy.setRequest( source.getRequest() );
        copy.setStatus( source.getStatus() );
        copy.setBody_size( source.getBody_size() );
        copy.setHttp_ref( source.getHttp_ref() );
        copy.setUser_agent( source.getUser_agent() );
        return copy;
    }

    /** Returns {@code time - time1} in milliseconds. */
    private long timeDiff(String time, String time1) {
        return this.myToDate( time ).getTime() - this.myToDate( time1 ).getTime();
    }

    /**
     * Parses a {@code yyyy-MM-dd HH:mm:ss} timestamp.
     *
     * @throws IllegalArgumentException if the text cannot be parsed
     */
    private Date myToDate(String time) {
        try {
            return sdf.parse( time );
        } catch (ParseException e) {
            // Fail with the offending value instead of returning null and
            // producing a NullPointerException somewhere in the sort.
            throw new IllegalArgumentException( "Unparseable record time: " + time, e );
        }
    }
}

WebLogBean:

/**
 * One cleaned access-log record, used as the map output value of the
 * page-view job.
 *
 * <p>{@link #toString()} renders the fields separated by the {@code \001}
 * character.
 */
public class WebLogBean implements WritableComparable {
    private boolean valid = true;
    private String  ip ;
    private String  user_id;
    // Request time, e.g. 2018-07-07 16:59:59
    private String  time;
    // Requested URL
    private String  request;
    // Response status of the request
    private String  status;
    private String  body_size;
    // Referring page
    private String  http_ref;
    // Browser, operating system and similar client information
    private String  user_agent;

    public WebLogBean() {
    }

    /** Sets every field in one call so a single instance can be reused. */
    public void  setAll(boolean valid, String ip, String user_id, String time, String request, String status, String body_size, String http_ref, String user_agent) {
        this.valid = valid;
        this.ip = ip;
        this.user_id = user_id;
        this.time = time;
        this.request = request;
        this.status = status;
        this.body_size = body_size;
        this.http_ref = http_ref;
        this.user_agent = user_agent;
    }

    /**
     * Compares by the {@link #toString()} text of the two objects.
     *
     * <p>NOTE(review): the argument is compared to this object, not the other
     * way round, so the resulting order is the reverse of the natural string
     * order. Kept as is to avoid changing behavior; confirm whether that is
     * intended.
     */
    @Override
    public int compareTo(Object o) {
        return o.toString().compareTo( this.toString() );

    }

    /** Returns {@code ""} for {@code null}, because {@code writeUTF} rejects {@code null}. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Serializes the record. Unset ({@code null}) string fields are written
     * as empty strings so a partially filled bean does not throw
     * {@link NullPointerException}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeBoolean( valid );
        out.writeUTF( nullToEmpty( ip ) );
        out.writeUTF( nullToEmpty( user_id ) );
        out.writeUTF( nullToEmpty( time ) );
        out.writeUTF( nullToEmpty( request ) );
        out.writeUTF( nullToEmpty( status ) );
        out.writeUTF( nullToEmpty( body_size ) );
        out.writeUTF( nullToEmpty( http_ref ) );
        out.writeUTF( nullToEmpty( user_agent ) );
    }

    /** Reads the fields in the same order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.valid = in.readBoolean();
        this.ip = in.readUTF();
        this.user_id = in.readUTF();
        this.time = in.readUTF();
        this.request = in.readUTF();
        this.status = in.readUTF();
        this.body_size = in.readUTF();
        this.http_ref = in.readUTF();
        this.user_agent = in.readUTF();
    }

    /** Renders all fields, valid flag first, separated by {@code \001}. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append( this.valid )
                .append( "\001" ).append( this.ip )
                .append( "\001" ).append( this.user_id )
                .append( "\001" ).append( this.time )
                .append( "\001" ).append( this.request )
                .append( "\001" ).append( this.status )
                .append( "\001" ).append( this.body_size )
                .append( "\001" ).append( this.http_ref )
                .append( "\001" ).append( this.user_agent );
        return sb.toString();
    }

    public boolean isValid() {
        return valid;
    }

    public void setValid(boolean valid) {
        this.valid = valid;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getUser_id() {
        return user_id;
    }

    public void setUser_id(String user_id) {
        this.user_id = user_id;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }

    public String getBody_size() {
        return body_size;
    }

    public void setBody_size(String body_size) {
        this.body_size = body_size;
    }

    public String getHttp_ref() {
        return http_ref;
    }

    public void setHttp_ref(String http_ref) {
        this.http_ref = http_ref;
    }

    public String getUser_agent() {
        return user_agent;
    }

    public void setUser_agent(String user_agent) {
        this.user_agent = user_agent;
    }
}

 

PVBean:

/**
 * One page view: a log record enriched with its session id, its step number
 * within the session and the stay time on the page.
 *
 * <p>{@link #toString()} renders the fields separated by the {@code \001}
 * character.
 */
public class PVBean implements Writable {
    private String session;
    private String ip;
    private String time;
    private String request;
    private int step;
    private String length;
    private String http_ref;
    private String user_agent;
    private String body_size;
    private String status;

    /** Sets every field in one call so a single instance can be reused. */
    public void setAll(String session,String ip,String time,String request,int step,String length
            ,String http_ref,String user_agent,String body_size,String status){
        this.setSession(session);
        this.setIp(ip);
        this.setTime(time);
        this.setRequest(request);
        this.setStep(step);
        this.setLength(length);
        this.setHttp_ref(http_ref);
        this.setUser_agent(user_agent);
        this.setBody_size(body_size);
        this.setStatus(status);
    }

    /** Returns {@code ""} for {@code null}, because {@code writeUTF} rejects {@code null}. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Serializes the page view. Unset ({@code null}) string fields are
     * written as empty strings so a partially filled bean does not throw
     * {@link NullPointerException}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF( nullToEmpty( session ) );
        out.writeUTF( nullToEmpty( ip ) );
        out.writeUTF( nullToEmpty( time ) );
        out.writeUTF( nullToEmpty( request ) );
        out.writeInt(step);
        out.writeUTF( nullToEmpty( length ) );
        out.writeUTF( nullToEmpty( http_ref ) );
        out.writeUTF( nullToEmpty( user_agent ) );
        out.writeUTF( nullToEmpty( body_size ) );
        out.writeUTF( nullToEmpty( status ) );
    }

    /** Reads the fields in the same order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.session = in.readUTF();
        this.ip = in.readUTF();
        this.time = in.readUTF();
        this.request = in.readUTF();
        this.step = in.readInt();
        this.length = in.readUTF();
        this.http_ref = in.readUTF();
        this.user_agent = in.readUTF();
        this.body_size = in.readUTF();
        this.status = in.readUTF();
    }

    /** Renders all fields, session id first, separated by {@code \001}. */
    @Override
    public String toString() {
        StringBuilder sb = new StringBuilder();
        sb.append( session )
                .append( "\001" ).append( ip )
                .append( "\001" ).append( time )
                .append( "\001" ).append( request )
                .append( "\001" ).append( step )
                .append( "\001" ).append( length )
                .append( "\001" ).append( http_ref )
                .append( "\001" ).append( user_agent )
                .append( "\001" ).append( body_size )
                .append( "\001" ).append( status );

        return sb.toString();
    }

    public String getSession() {
        return session;
    }

    public void setSession(String session) {
        this.session = session;
    }

    public String getIp() {
        return ip;
    }

    public void setIp(String ip) {
        this.ip = ip;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }

    public String getRequest() {
        return request;
    }

    public void setRequest(String request) {
        this.request = request;
    }

    public int getStep() {
        return step;
    }

    public void setStep(int step) {
        this.step = step;
    }

    public String getLength() {
        return length;
    }

    public void setLength(String length) {
        this.length = length;
    }

    public String getHttp_ref() {
        return http_ref;
    }

    public void setHttp_ref(String http_ref) {
        this.http_ref = http_ref;
    }

    public String getUser_agent() {
        return user_agent;
    }

    public void setUser_agent(String user_agent) {
        this.user_agent = user_agent;
    }

    public String getBody_size() {
        return body_size;
    }

    public void setBody_size(String body_size) {
        this.body_size = body_size;
    }

    public String getStatus() {
        return status;
    }

    public void setStatus(String status) {
        this.status = status;
    }
}
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.weblogAnalytic.pageview.PageViewDriver /flume/weblog/etl /flume/weblog/pv

pv统计在etl的数据的基础上再进行操作。

530e77bc-0ec5-4871-bfe3-35bcfc296cc9111.161.17.1042013-09-18 12:17:25/hadoop-hive-intro/160"http://blog.fens.me/series-hadoop-cloud/""Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.66 Safari/537.36" 14763200

Visit分析模型构建:


对清洗后数据针对每一个会话,进行分组,统计每个session的起始时间、结束时间、进入页面、离开页面、访问页数、来源页面等信息

 

VisitViewDriver:

/**
 * Driver for the visit job: reads the page-view output, groups it with
 * {@link VisitViewMapper} and lets {@link VisitViewReducer} emit one
 * {@link VisitBean} per session.
 */
public class VisitViewDriver extends Configured implements Tool {
    /**
     * Configures and runs the visit job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} is the output path
     * @return 0 if the job succeeded, 1 if it failed, 2 if the arguments are missing
     * @throws Exception if the job cannot be configured or submitted
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println( "Usage: VisitViewDriver <input path> <output path>" );
            return 2;
        }

        // 1. Create the job.
        Job job = Job.getInstance( this.getConf(), "VisitViewDriver" );
        job.setJarByClass( VisitViewDriver.class );

        // 2. Configure the job.
        // (2.1) input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath );

        // (2.2) map
        job.setMapperClass( VisitViewMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( PVBean.class );

        // (2.4) reduce
        job.setReducerClass( VisitViewReducer.class );
        job.setOutputKeyClass( VisitBean.class );
        job.setOutputValueClass( NullWritable.class );

        // (2.5) output: remove a pre-existing output directory so the job can
        // be re-run. The file system is resolved from the path itself so a
        // fully-qualified URI on a non-default file system also works.
        Path outputPath = new Path( args[1] );
        FileSystem fs = outputPath.getFileSystem( this.getConf() );
        if (fs.exists( outputPath )) {
            fs.delete( outputPath, true );
        }
        FileOutputFormat.setOutputPath( job, outputPath );

        // 3. Submit and wait.
        return job.waitForCompletion( true ) ? 0 : 1;
    }

    /**
     * Command-line entry point. Exits with the status returned by
     * {@link #run(String[])}, or with 1 if an exception escapes.
     */
    public static void main(String[] args) {
        int status;
        try {
            status = ToolRunner.run( new Configuration(), new VisitViewDriver(), args );
        } catch (Exception e) {
            e.printStackTrace();
            // A failure must not be reported to the shell as success.
            status = 1;
        }
        System.exit( status );
    }
}

VisitViewMapper:

/**
 * Reads {@code \001}-separated page-view records, rebuilds a {@link PVBean}
 * from each one and emits it keyed by session id.
 */
public class VisitViewMapper extends Mapper<LongWritable, Text,Text,PVBean> {
    /** Number of fields in one page-view record. */
    private static final int FIELD_COUNT = 10;

    private Text outputKey = new Text(  );
    private PVBean outputValue = new PVBean();

    /**
     * Maps one page-view record. Records with too few fields or a
     * non-numeric step are counted and skipped instead of failing the task.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Limit -1 keeps trailing empty fields, so an empty last column still
        // yields the full number of fields.
        String[] items = value.toString().split( "\001", -1 );
        if (items.length < FIELD_COUNT) {
            context.getCounter( "user define group ","malformed pv record" ).increment( 1L );
            return;
        }

        int step;
        try {
            step = Integer.parseInt( items[4] );
        } catch (NumberFormatException e) {
            context.getCounter( "user define group ","malformed pv record" ).increment( 1L );
            return;
        }

        outputKey.set( items[0] );
        outputValue.setAll( items[0],items[1],items[2],items[3],step,items[5],items[6],items[7],items[8],items[9] );
        context.write( outputKey,outputValue );
    }
}

VisitViewReducer:

/**
 * Builds one {@link VisitBean} per session: orders the session's page views
 * by step and records the entry and exit time and page, the number of pages,
 * the referrer of the first page view and the client IP.
 */
public class VisitViewReducer extends Reducer<Text, PVBean, VisitBean, NullWritable> {
    private VisitBean outputKey = new VisitBean();
    private NullWritable outputValue = NullWritable.get();

    /**
     * Emits the visit summary for one session.
     *
     * @param key     session id
     * @param values  page views of that session, in no particular order
     * @param context job context the summary is written to
     */
    @Override
    protected void reduce(Text key, Iterable<PVBean> values, Context context) throws IOException, InterruptedException {
        // Each value is copied field by field before being stored, as the
        // original did via BeanUtils; a failed copy can no longer be swallowed.
        ArrayList<PVBean> beans = new ArrayList<>();
        for (PVBean bean : values) {
            PVBean copy = new PVBean();
            copy.setAll( bean.getSession(), bean.getIp(), bean.getTime(), bean.getRequest(), bean.getStep(),
                    bean.getLength(), bean.getHttp_ref(), bean.getUser_agent(), bean.getBody_size(), bean.getStatus() );
            beans.add( copy );
        }
        if (beans.isEmpty()) {
            return;
        }

        // Order by step. Integer.compare returns 0 for equal steps, which the
        // Comparator contract requires.
        Collections.sort( beans, new Comparator<PVBean>() {
            @Override
            public int compare(PVBean o1, PVBean o2) {
                return Integer.compare( o1.getStep(), o2.getStep() );
            }
        } );

        PVBean first = beans.get( 0 );
        PVBean last = beans.get( beans.size() - 1 );
        outputKey.setSession( key.toString() );
        outputKey.setInTime( first.getTime() );
        outputKey.setInPage( first.getRequest() );
        outputKey.setOutTime( last.getTime() );
        outputKey.setOutPage( last.getRequest() );
        outputKey.setPageNum( beans.size() );
        outputKey.setHttp_ref( first.getHttp_ref() );
        outputKey.setIp( first.getIp() );
        context.write( outputKey,outputValue );
    }
}

VisitBean

/**
 * Summary of one session: entry and exit time and page, number of pages
 * viewed, client IP and the referrer of the first page view.
 *
 * <p>{@link #toString()} renders the fields separated by the {@code \001}
 * character.
 */
public class VisitBean implements Writable {
    private String session;
    private String inTime;
    private String outTime;
    private String inPage;
    private String outPage;
    private int pageNum;
    private String ip;
    private String http_ref;

    /** Returns {@code ""} for {@code null}, because {@code writeUTF} rejects {@code null}. */
    private static String nullToEmpty(String s) {
        return s == null ? "" : s;
    }

    /**
     * Serializes the visit. Unset ({@code null}) string fields are written as
     * empty strings so a partially filled bean does not throw
     * {@link NullPointerException}.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF( nullToEmpty( session ) );
        out.writeUTF( nullToEmpty( inTime ) );
        out.writeUTF( nullToEmpty( outTime ) );
        out.writeUTF( nullToEmpty( inPage ) );
        out.writeUTF( nullToEmpty( outPage ) );
        out.writeUTF( nullToEmpty( ip ) );
        out.writeUTF( nullToEmpty( http_ref ) );
        out.writeInt( pageNum );
    }

    /** Reads the fields in the same order {@link #write(DataOutput)} emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.session = in.readUTF();
        this.inTime = in.readUTF();
        this.outTime = in.readUTF();
        this.inPage = in.readUTF();
        this.outPage = in.readUTF();
        this.ip = in.readUTF();
        this.http_ref = in.readUTF();
        this.pageNum = in.readInt();
    }

    /** Renders the fields separated by {@code \001}, session id first and page count last. */
    @Override
    public String toString() {
        return session + "\001" + ip + "\001" + inTime + "\001" + outTime + "\001" + inPage
                + "\001" + outPage + "\001" + http_ref + "\001" + pageNum;
    }
    public String getSession() {
        return session;
    }
    public void setSession(String session) {
        this.session = session;
    }
    public String getInTime() {
        return inTime;
    }
    public void setInTime(String inTime) {
        this.inTime = inTime;
    }
    public String getOutTime() {
        return outTime;
    }
    public void setOutTime(String outTime) {
        this.outTime = outTime;
    }
    public String getInPage() {
        return inPage;
    }
    public void setInPage(String inPage) {
        this.inPage = inPage;
    }
    public String getOutPage() {
        return outPage;
    }
    public void setOutPage(String outPage) {
        this.outPage = outPage;
    }
    public int getPageNum() {
        return pageNum;
    }
    public void setPageNum(int pageNum) {
        this.pageNum = pageNum;
    }
    public String getIp() {
        return ip;
    }
    public void setIp(String ip) {
        this.ip = ip;
    }
    public String getHttp_ref() {
        return http_ref;
    }
    public void setHttp_ref(String http_ref) {
        this.http_ref = http_ref;
    }
}
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.weblogAnalytic.visitview.VisitViewDriver /flume/weblog/pv /flume/weblog/vi 

分析后的数据:

022d82f3-7b70-43ce-bbd9-6e0eaadc420271.206.247.972013-09-19 03:39:572013-09-19 03:39:57/hadoop-mahout-roadmap//hadoop-mahout-roadmap/"http://f.dataguru.cn/thread-175501-1-1.html"1

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值