Text to be sorted:
a 1
a 9
b 3
a 7
b 8
b 10
a 5
a 9
Main class:
package com.wxj.sort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/*
* Created by wxj on 2019/8/7 0007 17:07
*/
public class SortMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // get the Job instance
        Job job = Job.getInstance(super.getConf(), SortMain.class.getSimpleName());
        // Step 1: read the input file
        // set the input format
        job.setInputFormatClass(TextInputFormat.class);
        // set the input path
        TextInputFormat.addInputPath(job, new Path("file:///E:***your input directory***\\input"));
        // Step 2: set the mapper class, which outputs k2, v2
        job.setMapperClass(SortMapper.class);
        job.setMapOutputKeyClass(PairSort.class);
        job.setMapOutputValueClass(Text.class);
        // partition
        // sort
        // combine
        // (all three left at their defaults; see the sketch after this class)
        // set the reducer class
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(PairSort.class);
        job.setOutputValueClass(NullWritable.class);
        // write out the data
        job.setOutputFormatClass(TextOutputFormat.class);
        TextOutputFormat.setOutputPath(job, new Path("file:///**your output directory**\\mySortOut2"));
        // submit the job
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    // load the Windows native Hadoop library so the job can run locally on Windows
    static {
        try {
            System.load("D:\\soft\\hadoop-2.6.0-cdh5.14.0\\bin\\hadoop.dll");
        } catch (UnsatisfiedLinkError e) {
            System.err.println("Native code library failed to load.\n" + e);
            System.exit(1);
        }
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new SortMain(), args);
        System.exit(run);
    }
}
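The partition, sort, and combine steps in run() are left at their defaults. The calls below are not in the original job; they are only a hedged sketch of what the implicit defaults amount to, to make those three commented steps concrete:

        // sketch only -- not part of the original job; these are the implicit defaults
        job.setPartitionerClass(org.apache.hadoop.mapreduce.lib.partition.HashPartitioner.class); // hash of k2 picks the reduce partition
        job.setNumReduceTasks(1);  // a single reducer yields one globally sorted output file
        // no setSortComparatorClass() call is needed: because PairSort implements
        // WritableComparable, the shuffle sort falls back to PairSort.compareTo()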
Mapper class:
package com.wxj.sort;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
* Created by wxj on 2019/8/7 0007 17:08
*/
public class SortMapper extends Mapper<LongWritable, Text, PairSort, Text> {
    // process k1, v1: k1 is the byte offset of the line, v1 is one line of text
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        PairSort pairSort = new PairSort();
        // split the line on the tab character
        String[] split = value.toString().split("\t");
        // fill in the PairSort key
        pairSort.setFirst(split[0]);
        // System.out.println(split[1]);
        pairSort.setSecond(Integer.parseInt(split[1]));
        context.write(pairSort, value);
    }
}
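For a single input line such as a<TAB>9, map() builds the key (a, 9) and passes the raw line through as the value. A minimal stand-alone sketch of that transformation (the class name MapSplitDemo is made up for illustration and is not part of the job):

public class MapSplitDemo {
    public static void main(String[] args) {
        String v1 = "a\t9";                        // one input line, tab separated
        String[] split = v1.split("\t");           // ["a", "9"]
        PairSort k2 = new PairSort();
        k2.setFirst(split[0]);                     // "a"
        k2.setSecond(Integer.parseInt(split[1]));  // 9
        System.out.println(k2 + " / " + v1);       // k2 = "a	9", v2 = the original line
    }
}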
Reducer class:
package com.wxj.sort;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
* Created by wxj on 2019/8/7 0007 17:08
*/
// reduce receives PairSort keys and Text values
public class SortReducer extends Reducer<PairSort, Text, PairSort, NullWritable> {
    @Override
    protected void reduce(PairSort key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        /*for (Text value : values) {
            context.write(key, NullWritable.get());
        }*/
        context.write(key, NullWritable.get());
    }
}
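With this reducer, reduce() is called once per distinct key and writes exactly one line per call, no matter how many values arrive in the group. A minimal plain-Java sketch of the same grouping effect outside Hadoop (the class name and the hard-coded records are illustrative only):

import java.util.*;

public class GroupOnceDemo {
    public static void main(String[] args) {
        // part of the mapper's output, including the duplicate "a	9"
        List<String> mapOutput = Arrays.asList("a\t1", "a\t9", "a\t9", "b\t3");
        // group identical keys, as the shuffle does before reduce()
        Map<String, List<String>> groups = new TreeMap<>();
        for (String record : mapOutput) {
            groups.computeIfAbsent(record, k -> new ArrayList<>()).add(record);
        }
        // writing each key once, as the reducer above does, drops the duplicate
        groups.keySet().forEach(System.out::println); // a	1 / a	9 / b	3 -- only one "a	9"
    }
}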
Custom key object that implements the sorting:
package com.wxj.sort;
//import lombok.Data;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/*
* Created by wxj on 2019/8/7 0007 17:09
*/
//@Data
public class PairSort implements WritableComparable<PairSort> {
    private String first;
    private Integer second;

    @Override
    public String toString() {
        return first + "\t" + second;
    }

    public String getFirst() {
        return first;
    }

    public void setFirst(String first) {
        this.first = first;
    }

    public Integer getSecond() {
        return second;
    }

    public void setSecond(Integer second) {
        this.second = second;
    }

    // custom comparison method
    @Override
    public int compareTo(PairSort o) {
        // compare the first column
        int i = this.first.compareTo(o.first);
        if (i != 0) {
            // the first columns differ, so they decide the order
            System.out.println("first column comparison returned " + i);
            return i;
        } else {
            // the first columns are equal, so compare the second column
            int res = this.second.compareTo(o.second);
            System.out.println("second column comparison returned " + res);
            return res;
        }
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(first);
        dataOutput.writeInt(second);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.first = dataInput.readUTF();
        this.second = dataInput.readInt();
    }

    public static void main(String[] args) {
        PairSort p1 = new PairSort();
        PairSort p2 = new PairSort();
        p1.setFirst("a");
        p1.setSecond(1);
        p2.setFirst("b");
        p2.setSecond(2);
        p1.compareTo(p2);
    }
}
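Because PairSort is a WritableComparable, the framework serializes it with write() during the map-side spill and rebuilds it with readFields() on the reduce side. A minimal round-trip sketch (the class name PairSortRoundTrip is illustrative only):

import java.io.*;

public class PairSortRoundTrip {
    public static void main(String[] args) throws IOException {
        PairSort in = new PairSort();
        in.setFirst("a");
        in.setSecond(9);

        // serialize with write(), as the map-side buffer does
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bos));

        // deserialize with readFields(), as the reduce side does
        PairSort out = new PairSort();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bos.toByteArray())));

        System.out.println(out);               // a	9
        System.out.println(in.compareTo(out)); // 0 -> the two keys count as equal
    }
}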
Sorted output:
a 1
a 5
a 7
a 9
b 3
b 8
b 10
The sort order is correct, but one of the `a 9` records is missing.
To track down where that `a 9` was lost, I added some debug output to the mapper and reducer classes. The mapper's log shows that both `a 9` records are emitted by the mapper, so the data is not lost there.
The reducer's log tells a different story: the record disappears in the reducer. The reducer merges all records with the same key into one group, so inside reduce there is a single key-value group of the form key: (a, 9), values: [(a 9), (a 9)], and the iterator really does contain both `a 9` values.
Let's explain this from the framework source. The snippet below computes nextKeyIsSame, i.e. whether the next record's key compares equal to the current one:
if (this.hasMore) {
    nextKey = this.input.getKey();
    this.nextKeyIsSame = this.comparator.compare(this.currentRawKey.getBytes(), 0, this.currentRawKey.getLength(),
            nextKey.getData(), nextKey.getPosition(), nextKey.getLength() - nextKey.getPosition()) == 0;
} else {
    this.nextKeyIsSame = false;
}
If we don't want to lose one of the `a 9` records, the reducer has to be changed:
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SortReducer extends Reducer<PairSort, Text, PairSort, NullWritable> {
    Logger log = LoggerFactory.getLogger(SortReducer.class);

    @Override
    protected void reduce(PairSort key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
        System.out.println("k2 key---->" + key);
        /* if (values.iterator().hasNext()) {
            System.out.println("v2 values----->" + values.iterator().next());
        } */
        // context.write(key, NullWritable.get());
        // write the key once for every value in the group, so duplicates are preserved
        for (Text value : values) {
            context.write(key, NullWritable.get());
        }
    }
}
The output is now:
a 1
a 5
a 7
a 9
a 9
b 3
b 8
b 10
In MapTask.class, the sortInternal method
compares the buffered records one by one by their position in the buffer: r is the number of records (8 in this example) and p is the starting index, initially 0.
for (i = p; i < r; ++i) {
    for (j = i; j > p && s.compare(j - 1, j) > 0; --j) {
        s.swap(j, j - 1);
    }
}
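This double loop is a plain insertion sort over the buffered records; in the real code s.compare and s.swap go through the framework's key comparator and buffer. A minimal sketch of the same loop applied to an int array, using just the numeric column of the sample input (class name and array literal are illustrative only):

public class InsertionSortDemo {
    public static void main(String[] args) {
        int[] a = {1, 9, 3, 7, 8, 10, 5, 9};  // the numeric column of the sample input
        int p = 0, r = a.length;              // p = start index, r = number of records
        for (int i = p; i < r; ++i) {
            for (int j = i; j > p && a[j - 1] > a[j]; --j) {
                int tmp = a[j]; a[j] = a[j - 1]; a[j - 1] = tmp;  // like s.swap(j, j - 1)
            }
        }
        System.out.println(java.util.Arrays.toString(a)); // [1, 3, 5, 7, 8, 9, 9, 10] -- both 9s survive the map-side sort
    }
}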
Stepping through with breakpoints shows that before the data ever reaches reduce, a quick sort already runs on the map side to give the records an initial ordering; the reduce side then sorts and groups them again according to the compareTo method we defined.