Cleaning Tens of Thousands of Rows of Log Data with a Custom InputFormat

1) Requirement:

Split each log line on spaces; any line with 11 or fewer fields is considered invalid and must be removed.

Log file before cleaning: 14,619 lines in total.
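
As a quick illustration of the rule, the standalone snippet below splits a log line on spaces and counts the fields. The log line here is a hypothetical Apache-style record for illustration only, not taken from the actual data set:

public class FieldCountDemo {
    public static void main(String[] args) {
        // Hypothetical access-log line for illustration only
        String line = "194.237.142.21 - - [18/Sep/2013:06:49:18 +0000] \"GET /wp-content/uploads/2013/07/rstudio-git3.png HTTP/1.1\" 304 0 \"-\" \"Mozilla/4.0\"";
        String[] fields = line.split(" ");
        // This line splits into 12 fields, so it passes the length > 11 check and is kept
        System.out.println(fields.length + " fields -> " + (fields.length > 11 ? "keep" : "drop"));
    }
}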

 

2) Implementation:

1) Write the Mapper (LogCleanMap):

 

package com.itstar.mr.wc0908.mr.bigdata_13.clean;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author Jackson
 * 2019-09-05
 */
public class LogCleanMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line of input
        String line = value.toString();
        // 2. Split the line on spaces
        String[] word = line.split(" ");
        // 3. Keep only lines with more than 11 fields
        if (word.length > 11) {
            // Only the key carries data; NullWritable.get() supplies the empty value
            context.write(value, NullWritable.get());
        }
    }
}

 

2) Driver class:

 

package com.itstar.mr.wc0908.mr.bigdata_13.clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class LogCleanDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for testing; these override any command-line arguments
        args = new String[]{"D:\\input\\test\\plus\\web.txt", "D:\\input\\test\\plus\\outweb.txt"};

        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(LogCleanDriver.class);

        // Configure the mapper class and its output types
        job.setMapperClass(LogCleanMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Map-only job: set the number of reducers to 0 (default is 1)
        job.setNumReduceTasks(0);

        // Input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
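
The driver above runs locally against the hard-coded Windows paths. To submit the same job to a cluster, package the classes into a jar and run it with the standard hadoop jar command; the jar name and HDFS paths below are placeholders, and the hard-coded args line in main must be removed for the command-line paths to take effect:

hadoop jar logclean.jar com.itstar.mr.wc0908.mr.bigdata_13.clean.LogCleanDriver /input/web.txt /output/logclean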

 

Result:

Log file after cleaning: 13,770 lines in total.

 

 

Optimization: use counters to report how many records are valid and how many are invalid.

The validation logic is extracted into a helper method:

 

Mapper class:

package com.itstar.mr.wc0908.mr.bigdata_13.clean;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @author Jackson
 * 2019-11-05
 */
public class LogCleanMap extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // 1. Read one line of input
        String line = value.toString();

        // 2. Validate the line; the Context is passed in so the helper can update counters
        boolean valid = parseLog(line, context);

        // Skip lines that failed validation
        if (!valid) {
            return;
        }
        context.write(value, NullWritable.get());
    }

    public boolean parseLog(String line, Context context) {
        // Split the line on spaces
        String[] word = line.split(" ");

        // Keep lines with more than 11 fields; track both outcomes with counters
        if (word.length > 11) {
            // getCounter(group, name) names the counter shown in the job output
            context.getCounter("map", "valid records").increment(1);
            return true;
        } else {
            context.getCounter("map", "invalid records").increment(1);
            return false;
        }
    }
}

 

Driver class:

 

package com.itstar.mr.wc0908.mr.bigdata_13.clean;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class LogCleanDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // Hard-coded local paths for testing; these override any command-line arguments
        args = new String[]{"D:\\input\\test\\plus\\web.txt", "D:\\input\\test\\plus\\outweb4.txt"};

        Job job = Job.getInstance(new Configuration());

        job.setJarByClass(LogCleanDriver.class);

        // Configure the mapper class and its output types
        job.setMapperClass(LogCleanMap.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Map-only job: set the number of reducers to 0 (default is 1)
        job.setNumReduceTasks(0);

        // Input path
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Output path
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
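
The two counters appear automatically in the job's console output under the "map" group. If the driver needs to read them programmatically, they can be fetched from the Job after completion. A minimal sketch of the tail of main, assuming the counter names used above:

        boolean ok = job.waitForCompletion(true);

        // Counters are available from the Job once it has finished
        org.apache.hadoop.mapreduce.Counters counters = job.getCounters();
        long valid = counters.findCounter("map", "valid records").getValue();
        long invalid = counters.findCounter("map", "invalid records").getValue();
        System.out.println("valid records = " + valid + ", invalid records = " + invalid);
        System.exit(ok ? 0 : 1);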

 

Result:

The job's counter output lists the counts under the "map" group: the number of valid records and the number of invalid records.
