MapReduce Deduplication and Sorting

Deduplication (MapReduce)

Create three classes: Mapper, Reduce, and Launch. Then add the input data (each record contains only a key, K).

Notes:

1. These are the same as a typical Mapper, Reduce, and Launch, with only minor differences.

2. Because each record contains only a key, the Mapper emits NullWritable as its output value.

3. Because each record contains only a key, the Reduce does not need to iterate over the values: identical keys are grouped into a single call, so writing the key once deduplicates the data.
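For example, given hypothetical input like the left column below (one value per line; the numbers are invented for illustration), the job yields the right column: the duplicate 100 collapses into a single record, and the keys also come out sorted, because the shuffle sorts the Text keys:

input    output
100      100
200      200
100      300
300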

The code is as follows:

Mapper

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName SortMapper
 * @Description: emits each input line as the key, with a null value
 * @Author yangRui
 */
public class SortMapper extends Mapper<LongWritable, Text, Text, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // The line itself becomes the key; the value carries no information
        context.write(value, NullWritable.get());
    }
}

Reduce

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName SortReduce
 * @Description: writes each distinct key once, dropping duplicates
 * @Author yangRui
 */
public class SortReduce extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // The shuffle groups identical keys into a single call,
        // so writing the key once deduplicates the data
        context.write(key, NullWritable.get());
    }
}

Launch

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @ClassName SortLaunch
 * @Description: driver that configures and submits the deduplication job
 * @Author yangRui
 */
public class SortLaunch {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Create the job object
        Job job = Job.getInstance();
        // Name the job
        job.setJobName("sort1");

        // Set the Mapper, Reducer, and driver classes
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReduce.class);
        job.setJarByClass(SortLaunch.class);

        // Set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Set the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);

        // Get the configuration and delete the output directory if it already exists
        FileSystem fs = FileSystem.get(job.getConfiguration());
        Path out = new Path("D:\\ideaProject\\hadoop\\hadoop_pro\\sort1\\output");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path("D:\\ideaProject\\hadoop\\hadoop_pro\\sort1\\input"));
        FileOutputFormat.setOutputPath(job, out);

        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
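One optional tweak, not in the original post: because the reducer simply emits each distinct key, it can double as a combiner, so duplicates are already dropped on the map side before the shuffle. A one-line, hypothetical addition to the driver above:

// Hypothetical addition to the driver: reuse SortReduce as a combiner so
// duplicate keys are discarded locally before data crosses the network
job.setCombinerClass(SortReduce.class);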

Results

Before: (screenshot omitted)

After: (screenshot omitted)

Sorting (MapReduce)

Create:

1. Three classes: Mapper, Reduce, and Launch.

2. A utility class, KeyCompartor, which extends WritableComparator.

3. Input data (each record contains only a key, K).

Notes:

1. These are the same as a typical Mapper, Reduce, and Launch, with only minor differences.

2. Because each record contains only a key, the Mapper emits NullWritable as its output value.

3. Because each record contains only a key, the Reduce must iterate over the values so that duplicate keys are preserved rather than collapsed.

4. Launch gains one extra setting that plugs in the utility class for sorting, plus a setting for the number of reduce tasks.

5. The utility class implements the sort order. (A small worked example follows this list.)
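For instance, with hypothetical input such as 300, 100, 300, 200 (one number per line; values invented for illustration), the descending-sort job emits 300, 300, 200, 100: the duplicate 300 survives because the reducer writes the key once per value it receives.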

Mapper

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @ClassName SortMapper
 * @Description: parses each line as a long and emits it as the key
 * @Author yangRui
 */
public class SortMapper extends Mapper<LongWritable, Text, LongWritable, NullWritable> {

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();                                 // read one line of data
        LongWritable outKey = new LongWritable(Long.parseLong(line));   // convert the String to a long
        context.write(outKey, NullWritable.get());
    }
}
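If the input could contain blank or whitespace-only lines (an assumption on my part; the original post does not address this), Long.parseLong would throw a NumberFormatException and fail the task. A minimal defensive variant of the map body:

// Hypothetical guard: skip empty lines before parsing
String line = value.toString().trim();
if (!line.isEmpty()) {
    context.write(new LongWritable(Long.parseLong(line)), NullWritable.get());
}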

Reduce

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * @ClassName SortReduce
 * @Description: writes the key once per value so duplicate keys survive
 * @Author yangRui
 */
public class SortReduce extends Reducer<LongWritable, NullWritable, LongWritable, NullWritable> {

    @Override
    protected void reduce(LongWritable key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Iterate over the empty values so that duplicate keys are preserved
        // rather than collapsed into a single output record
        for (NullWritable item : values) {
            context.write(key, NullWritable.get());
        }
    }
}

Launch

import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @ClassName SortLaunch
 * @Description: driver that configures and submits the sorting job
 * @Author yangRui
 */
public class SortLaunch {
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // Create the job object
        Job job = Job.getInstance();
        // Name the job
        job.setJobName("sort1");

        // Set the Mapper, Reducer, and driver classes
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReduce.class);
        job.setJarByClass(SortLaunch.class);

        // Set the map output types
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Set the final output types
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Plug in the utility class that defines the sort order
        job.setSortComparatorClass(KeyCompartor.class);

        // Get the configuration and delete the output directory if it already exists
        FileSystem fs = FileSystem.get(job.getConfiguration());
        Path out = new Path("D:\\ideaProject\\hadoop\\hadoop_pro\\sort1\\output");
        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        // Set the input and output paths
        FileInputFormat.addInputPath(job, new Path("D:\\ideaProject\\hadoop\\hadoop_pro\\sort1\\input"));
        FileOutputFormat.setOutputPath(job, out);

        // Use a single reduce task so the output is one globally sorted file
        job.setNumReduceTasks(1);
        // Submit the job and wait for completion
        job.waitForCompletion(true);
    }
}
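The main method above discards the return value of waitForCompletion. A small optional refinement (my suggestion, not in the original post) is the common driver pattern of turning it into the process exit code, so scripts and schedulers can detect a failed job:

// Hypothetical replacement for the last line of main:
// exit with 0 on success, 1 on failure
System.exit(job.waitForCompletion(true) ? 0 : 1);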

KeyCompartor

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * @ClassName KeyCompartor
 * @Description: sorts LongWritable keys in descending order
 * @Author yangRui
 */
public class KeyCompartor extends WritableComparator {
    public KeyCompartor() {
        // The second argument tells WritableComparator to create key instances
        // for comparison; the default (false) would not create them
        super(LongWritable.class, true);
    }

    /*
     * Return value: 0 => equal, negative => a sorts before b, positive => a sorts after b
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Compare the keys; negating the natural order sorts in descending order
        LongWritable left = (LongWritable) a;
        LongWritable right = (LongWritable) b;
        return -(left.compareTo(right));
    }
}
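To sanity-check the descending order outside of Hadoop, a throwaway main class (hypothetical, not part of the original post) can exercise the comparator directly:

import org.apache.hadoop.io.LongWritable;

public class KeyCompartorCheck {
    public static void main(String[] args) {
        KeyCompartor cmp = new KeyCompartor();
        // A positive result means the first key sorts after the second,
        // i.e. larger numbers come first: descending order
        System.out.println(cmp.compare(new LongWritable(1), new LongWritable(2))); // prints 1
        System.out.println(cmp.compare(new LongWritable(5), new LongWritable(5))); // prints 0
    }
}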

Results

Before: (screenshot omitted)

After: (screenshot omitted)

Troubleshooting

Error message: (screenshot omitted)

Solution: add a log4j.properties file under the resources directory:

log4j.rootLogger=INFO, stdout, file

log4j.appender.file=org.apache.log4j.RollingFileAppender
log4j.appender.file.File=/logs/myapp.log
log4j.appender.file.MaxFileSize=10MB
log4j.appender.file.MaxBackupIndex=10
log4j.appender.file.layout=org.apache.log4j.PatternLayout
log4j.appender.file.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} [%-5p] [%-30F:%-5L] - %m%n

log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.target=System.out
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=[%-5p] %d{yyyy-MM-dd HH:mm:ss,SSS} [%-20t] %c:%L %m%n