复杂的文件清洗

LogBean类

java
package clean;

import lombok.Getter;
import lombok.Setter;
    @Setter
    @Getter

/**
 * One parsed record of a web-server access log.
 *
 * <p>The field names intentionally keep the snake_case log-format names because the
 * accessors used elsewhere (for example {@code setRemote_addr}) are derived from them
 * by the Lombok annotations placed on this class.
 */
public class LogBean {
    /** Separator written between fields by {@link #toString()}. */
    private static final String FIELD_SEPARATOR = "\001";

    private String remote_addr;      // client IP address
    private String remote_user;      // client user name; "-" means the attribute is ignored
    private String time_local;       // access time and time zone
    private String request;          // requested URL and HTTP protocol
    private String status;           // request status; 200 means success
    private String body_bytes_sent;  // size of the body content sent to the client
    private String http_referer;     // page link the visit came from
    private String http_user_agent;  // information about the client browser

    private boolean valid = true;    // whether the record is considered valid

    /**
     * Serializes the record as a single line: the validity flag first, then every
     * field in declaration order, each preceded by {@link #FIELD_SEPARATOR}.
     * Unset fields are rendered as the text {@code null}.
     *
     * @return the separator-delimited representation of this record
     */
    @Override
    public String toString() {
        // The buffer never leaves this method, so the unsynchronized StringBuilder
        // is sufficient; StringBuffer's locking would buy nothing here.
        return new StringBuilder()
                .append(valid)
                .append(FIELD_SEPARATOR).append(remote_addr)
                .append(FIELD_SEPARATOR).append(remote_user)
                .append(FIELD_SEPARATOR).append(time_local)
                .append(FIELD_SEPARATOR).append(request)
                .append(FIELD_SEPARATOR).append(status)
                .append(FIELD_SEPARATOR).append(body_bytes_sent)
                .append(FIELD_SEPARATOR).append(http_referer)
                .append(FIELD_SEPARATOR).append(http_user_agent)
                .toString();
    }
}

LogCleanMapper

java
package clean;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * Mapper that cleans web access-log lines.
 *
 * <p>Each input line is parsed into a {@link LogBean}; lines that are judged invalid are
 * dropped, and every remaining record is written with a {@link NullWritable} key and the
 * bean's {@code toString()} text as the value.
 */
public class LogCleanMapper extends Mapper<LongWritable, Text, NullWritable, Text> {

    /** A line must split into at least this many space-separated tokens to be parsed. */
    private static final int MIN_FIELDS = 12;

    /** Status codes at or above this value mark the record as invalid. */
    private static final int FIRST_ERROR_STATUS = 400;

    /**
     * Parses one input line and emits it only when the resulting record is valid.
     *
     * @param key     input key supplied by the framework (not used)
     * @param value   one line of the access log
     * @param context task context the cleaned record is written to
     * @throws IOException          if writing the output fails
     * @throws InterruptedException if the task is interrupted while writing
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        LogBean logBean = pressLog(value.toString());
        if (!logBean.isValid()) {
            // Invalid records are silently filtered out of the output.
            return;
        }
        context.write(NullWritable.get(), new Text(logBean.toString()));
    }

    /**
     * Splits a log line on single spaces and fills a {@link LogBean} from the tokens.
     *
     * <p>The record is marked invalid when the line has fewer than {@link #MIN_FIELDS}
     * tokens, when the status token is not an integer, or when the status is
     * {@link #FIRST_ERROR_STATUS} or greater.
     *
     * @param line raw log line
     * @return the populated bean; check {@code isValid()} before using it
     */
    public LogBean pressLog(String line) {
        LogBean logBean = new LogBean();
        String[] s = line.split(" ");

        if (s.length < MIN_FIELDS) {
            logBean.setValid(false);
            return logBean;
        }

        logBean.setRemote_addr(s[0]);
        logBean.setRemote_user(s[1]);
        logBean.setTime_local(s[3] + s[4]);
        // NOTE(review): token 7 is stored as the request; if the URL itself is wanted,
        // token 6 may be the intended index - confirm against the actual log format.
        logBean.setRequest(s[7]);
        logBean.setStatus(s[8]);
        logBean.setBody_bytes_sent(s[9]);
        logBean.setHttp_referer(s[10]);

        // The user agent starts at token 11 and may continue into token 12. The earlier
        // code concatenated token 1 (the remote user) by mistake and dropped the space
        // that split(" ") had removed between the two parts.
        if (s.length > 12) {
            logBean.setHttp_user_agent(s[11] + " " + s[12]);
        } else {
            logBean.setHttp_user_agent(s[11]);
        }

        // A status of 400 or above, or a status that is not a number at all, marks the
        // record invalid. Catching the parse failure keeps one malformed line from
        // failing the whole task.
        try {
            if (Integer.parseInt(logBean.getStatus()) >= FIRST_ERROR_STATUS) {
                logBean.setValid(false);
            }
        } catch (NumberFormatException e) {
            logBean.setValid(false);
        }

        return logBean;
    }
}

LogDriver

java
package clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import telephone.LogCleanDriver;
import telephone.LogCleanMapper;

import java.io.IOException;

/**
 * Driver that configures and submits the map-only log-cleaning job.
 */
public class LogDriver {

    /** Input path used when no paths are supplied on the command line. */
    private static final String DEFAULT_INPUT = "G:\\a\\web.txt";

    /** Output path used when no paths are supplied on the command line. */
    private static final String DEFAULT_OUTPUT = "G:\\a\\OutWeb3.txt";

    /**
     * Builds the job, runs it to completion and exits with 0 on success or 1 on failure.
     *
     * @param args {@code args[0]} is the input path and {@code args[1]} the output path;
     *             when fewer than two arguments are given the built-in default paths are used
     * @throws IOException            if the job cannot be created or submitted
     * @throws ClassNotFoundException if a configured job class cannot be loaded
     * @throws InterruptedException   if the wait for completion is interrupted
     */
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Fall back to the local sample paths only when the caller supplied none, so that
        // command-line arguments are no longer silently overwritten.
        if (args == null || args.length < 2) {
            args = new String[]{DEFAULT_INPUT, DEFAULT_OUTPUT};
        }

        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(LogDriver.class);

        // Configure the mapper class and the output key/value types.
        // NOTE(review): this file imports telephone.LogCleanMapper, and a single-type import
        // takes precedence over a same-package class of the same name - confirm which mapper
        // is intended and remove the stray import if the one in this package is meant.
        job.setMapperClass(LogCleanMapper.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        // Set the number of reduce tasks (the default is 1); 0 makes this a map-only job.
        job.setNumReduceTasks(0);

        // Input data path.
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output data path.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and report its outcome through the process exit code
        // instead of discarding the result.
        boolean succeeded = job.waitForCompletion(true);
        System.exit(succeeded ? 0 : 1);
    }
}

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值