[MR] Common Sorting Patterns in MapReduce
1. Hadoop's default sort orders records by key only; for Text keys that means lexicographic (dictionary) order, so as strings "10" sorts before "2".
Straight to the code.
Mapper
package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Text sort - map phase. Hadoop's default sort only orders by key,
 * so each whole input line is emitted as the key.
 * @author Young
 * created on 2017-6-30
 */
public class SortMapper extends Mapper<LongWritable, Text, Text, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString(); // the full text line becomes the sort key
        context.write(new Text(line), NullWritable.get());
    }
}
Reducer
package Hadoop.MR.sort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Text sort - reduce phase. The shuffle has already sorted the keys,
 * so the reducer only has to write them back out.
 * @author Young
 * created on 2017-6-30
 */
public class SortReducer extends Reducer<Text, NullWritable, Text, NullWritable> {
    @Override
    protected void reduce(Text k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        // Note: duplicate input lines arrive as a single key, so each distinct
        // line is written once. To keep duplicates, write k2 once per value in v2.
        context.write(k2, NullWritable.get());
    }
}
Driver
package Hadoop.MR.sort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the default sort job.
 * @author Young
 * @version created 2017-06-30 09:31:50
 */
public class SortDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Sort");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(NullWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SortDriver(), args);
        System.exit(exitCode);
    }
}
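For reference, a typical invocation looks like this (the jar name and HDFS paths are placeholders, not from the original post):

    hadoop jar mr-sort.jar Hadoop.MR.sort.SortDriver /data/input /data/sorted

Because the driver goes through ToolRunner, generic options such as -D mapreduce.job.reduces=1 can be passed before the input path.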
Before and after sorting (screenshot in the original post):
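To make "dictionary order" concrete, here is a hypothetical before/after (my illustration, not the original screenshot). Note that "10" sorts before "2" because keys compare as text, byte by byte, not as numbers.

Input:
    banana
    10
    apple
    2

Sorted output:
    10
    2
    apple
    banana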
2. Custom sort: order by the first column; when two rows tie, order by the second column.
The custom bean
package Hadoop.MR.mysort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

/**
 * Implements WritableComparable and overrides compareTo to define the sort order.
 * @author Young
 * @version created 2017-06-30 09:54:14
 */
public class SortBean implements WritableComparable<SortBean> {
    private long firstNum;
    private long secondNum;

    public SortBean() {
        // Hadoop needs a no-arg constructor to deserialize the bean.
    }

    public SortBean(long first, long second) {
        this.firstNum = first;
        this.secondNum = second;
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.firstNum = in.readLong();
        this.secondNum = in.readLong();
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(firstNum);
        out.writeLong(secondNum);
    }

    @Override
    public int compareTo(SortBean o) {
        // Negative: this sorts before o; positive: after; zero: equal.
        // Returning 0 for equal beans matters: without it the compareTo
        // contract is broken and equal keys would not group together.
        if (this.firstNum != o.firstNum) {
            return Long.compare(this.firstNum, o.firstNum);
        }
        return Long.compare(this.secondNum, o.secondNum);
    }

    @Override
    public String toString() {
        return this.firstNum + " " + this.secondNum;
    }

    public long getFirstNum() {
        return firstNum;
    }

    public void setFirstNum(long firstNum) {
        this.firstNum = firstNum;
    }

    public long getSecondNum() {
        return secondNum;
    }

    public void setSecondNum(long secondNum) {
        this.secondNum = secondNum;
    }
}
Mapper
package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map phase for the custom sort.
 * @author Young
 * @version created 2017-06-30 10:20:12
 */
public class MySortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each line holds two tab-separated numbers; the pair becomes the key.
        String line = value.toString();
        String[] num = line.split("\t");
        long firstNum = Long.parseLong(num[0]);
        long secondNum = Long.parseLong(num[1]);
        SortBean bean = new SortBean(firstNum, secondNum);
        context.write(bean, NullWritable.get());
    }
}
Reducer
package Hadoop.MR.mysort;

import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase for the custom sort: keys arrive already ordered by
 * SortBean.compareTo, so just write them out.
 * @author Young
 * @version created 2017-06-30 10:30:42
 */
public class MySortReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {
    @Override
    protected void reduce(SortBean k2, Iterable<NullWritable> v2, Context context)
            throws IOException, InterruptedException {
        context.write(k2, NullWritable.get());
    }
}
Driver
package Hadoop.MR.mysort;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the custom sort job.
 * @author Young
 * @version created 2017-06-30 10:38:23
 */
public class MySortDriver extends Configured implements Tool {
    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "MySort");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MySortMapper.class);
        job.setReducerClass(MySortReducer.class);
        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MySortDriver(), args);
        System.exit(exitCode);
    }
}
Before and after sorting (screenshot in the original post):
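Again a hypothetical before/after (my illustration, not the original screenshot): tab-separated pairs, sorted by the first number and, on ties, by the second. The single space in the output comes from SortBean.toString().

Input:
    3   3
    3   1
    2   8
    1   5

Sorted output:
    1 5
    2 8
    3 1
    3 3

One caveat: with the default single reducer the output is globally sorted; with several reducers each part file is sorted only within its partition unless you add a total-order partitioner.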
3. Finding the maximum
Mapper
package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map phase for the maximum-value job.
 * @author Young
 * @version created 2017-09-05 15:47:02
 */
public class MaxMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // Each line is "account<TAB>income"; group incomes by account.
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        context.write(new Text(account), new DoubleWritable(income));
    }
}
Reducer
package Hadoop.MR.max;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase for the maximum-value job.
 * @author Young
 * @version created 2017-09-05 16:15:29
 */
public class MaxReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // Start from negative infinity, not Double.MIN_VALUE: MIN_VALUE is the
        // smallest *positive* double, which would give wrong answers for
        // negative incomes.
        double max = Double.NEGATIVE_INFINITY;
        for (DoubleWritable v : value) {
            max = Math.max(max, v.get());
        }
        context.write(key, new DoubleWritable(max));
    }
}
Driver
package Hadoop.MR.max;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the maximum-value job.
 * @author Young
 * @version created 2017-09-05 16:17:32
 */
public class MaxDriver extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new MaxDriver(), args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "Max");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(MaxMapper.class);
        job.setReducerClass(MaxReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Input and per-account maxima (screenshot in the original post):
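One optimization worth noting: because max is commutative and associative, MaxReducer can double as a combiner, letting each mapper pre-aggregate its local maxima so far less data crosses the shuffle. A single extra line in run() enables it (my suggestion, not in the original driver):

    // Pre-aggregate per-mapper maxima before the shuffle.
    job.setCombinerClass(MaxReducer.class);

The output is unchanged; only the shuffle volume shrinks.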
4. Top N. The file contains many different keys, and each key appears more than once; for every key, keep its three largest values.
Mapper
package Hadoop.MR.topn;

import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Map phase for the top-N job.
 * @author Young
 * @version created 2017-09-10 15:26:00
 */
public class TopNMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    // Reuse the output objects across calls to avoid churning the garbage collector.
    private Text k = new Text();
    private DoubleWritable v = new DoubleWritable();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] lines = value.toString().split("\t");
        String account = lines[0];
        double income = Double.parseDouble(lines[1]);
        k.set(account);
        v.set(income);
        context.write(k, v);
    }
}
Reducer
package Hadoop.MR.topn;

import java.io.IOException;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reduce phase for the top-N job: keep each key's n largest values.
 * @author Young
 * @version created 2017-09-10 15:49:01
 */
public class TopNReducer extends Reducer<Text, DoubleWritable, Text, DoubleWritable> {
    private TreeSet<Double> incomeTreeSet = new TreeSet<Double>();
    private DoubleWritable v = new DoubleWritable();
    private int n = 3;

    @Override
    protected void reduce(Text key, Iterable<DoubleWritable> value, Context context)
            throws IOException, InterruptedException {
        // The set is an instance field, so it must be cleared for each key;
        // otherwise values leak from one key into the next key's top N.
        incomeTreeSet.clear();
        for (DoubleWritable val : value) {
            incomeTreeSet.add(val.get());
            if (incomeTreeSet.size() > n) {
                // Evict the smallest element, keeping the n largest seen so far.
                incomeTreeSet.remove(incomeTreeSet.first());
            }
        }
        // Caveat: a TreeSet collapses duplicate values, so tied values count once.
        for (Double in : incomeTreeSet) {
            v.set(in);
            context.write(key, v);
        }
    }
}
Driver
package Hadoop.MR.topn;

import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

/**
 * Driver for the top-N job.
 * @author Young
 * @version created 2017-09-10 15:59:49
 */
public class TopNDriver extends Configured implements Tool {
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TopNDriver(), args);
        System.exit(exitCode);
    }

    public int run(String[] args) throws Exception {
        if (args.length != 2) {
            System.err.printf("Usage: %s [generic options] <input> <output>\n", getClass().getSimpleName());
            ToolRunner.printGenericCommandUsage(System.err);
            return -1;
        }
        Job job = Job.getInstance(getConf(), "TopN");
        job.setJarByClass(getClass());
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setMapperClass(TopNMapper.class);
        job.setReducerClass(TopNReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(DoubleWritable.class);
        return job.waitForCompletion(true) ? 0 : 1;
    }
}
Results (screenshot in the original post):
This produces the right result, but I suspect it could drag the reducer down: all the TreeSet insertions and evictions cost memory and CPU for every value of every key. It would be better to get the sorting done on the map side so the reducer only has to take the first n values, but I haven't worked that out. If you have a better algorithm, please weigh in.
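One pattern that matches this idea (a sketch under my own assumptions, not code from the original post) is in-mapper combining: each mapper keeps a small per-key top-n structure and emits only its local top n from cleanup(), so the shuffle carries at most n values per key per mapper and the reducer merges a handful of short, pre-filtered lists. The class name below is hypothetical.

package Hadoop.MR.topn;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeSet;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * In-mapper combining: keep only the local top-n values per account and
 * emit them in cleanup(), after the mapper has consumed its whole split.
 */
public class TopNCombiningMapper extends Mapper<LongWritable, Text, Text, DoubleWritable> {
    private static final int N = 3;
    // Local top-N per account, bounded to N entries per key.
    private final Map<String, TreeSet<Double>> topByAccount = new HashMap<String, TreeSet<Double>>();

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().split("\t");
        String account = fields[0];
        double income = Double.parseDouble(fields[1]);

        TreeSet<Double> top = topByAccount.get(account);
        if (top == null) {
            top = new TreeSet<Double>();
            topByAccount.put(account, top);
        }
        top.add(income);
        if (top.size() > N) {
            top.remove(top.first()); // evict the smallest, keep the N largest
        }
    }

    @Override
    protected void cleanup(Context context) throws IOException, InterruptedException {
        // Emit each account's local top N exactly once per mapper.
        Text k = new Text();
        DoubleWritable v = new DoubleWritable();
        for (Map.Entry<String, TreeSet<Double>> e : topByAccount.entrySet()) {
            k.set(e.getKey());
            for (Double income : e.getValue()) {
                v.set(income);
                context.write(k, v);
            }
        }
    }
}

The existing TopNReducer then works unchanged as the final merge step. The trade-off is mapper memory proportional to the number of distinct keys in its split, and the TreeSet still collapses tied values, exactly as in the reducer.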