1. InputSplit
An InputSplit is a split: in a MapReduce job, it is the smallest unit of input handed to a map task. A split is a concept layered on top of a file; intuitively, it describes how a file is cut into segments, each segment carrying <file name, start offset, length, hosts where the data lives>. Once a map task receives its split, it knows where to start reading its data.
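As a rough sketch of how split boundaries come about: FileInputFormat sizes splits from the HDFS block size, clamped by the configured minimum and maximum split sizes (mapreduce.input.fileinputformat.split.minsize / .maxsize). A minimal illustration of that rule, simplified from the real implementation:

// Simplified sketch of FileInputFormat's split-size rule (not the full implementation).
public class SplitSizeSketch {
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        // Use the block size, but keep it within [minSize, maxSize].
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        // With 128 MB blocks and default limits, a 200 MB file yields
        // two splits: one of 128 MB and one of 72 MB.
        long blockSize = 128L << 20;
        System.out.println(computeSplitSize(blockSize, 1L, Long.MAX_VALUE)); // 134217728
    }
}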
2. Processing stages
input -> map -> partition -> sort -> combine (everything up to here runs in the map task) -> shuffle -> reduce -> output (this part runs in the reduce task)
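Each pluggable stage has a corresponding setter on Job; a quick sketch of the wiring (the MyXxx class names here are placeholders, not classes from the example below):

// Placeholder wiring: one Job setter per pluggable stage.
job.setInputFormatClass(TextInputFormat.class);    // input: split -> <key, value> records
job.setMapperClass(MyMapper.class);                // map
job.setPartitionerClass(HashPartitioner.class);    // partition: key -> reducer index
job.setSortComparatorClass(MyComparator.class);    // sort: key order during the shuffle
job.setCombinerClass(MyCombiner.class);            // combine: map-side local pre-reduce
job.setReducerClass(MyReducer.class);              // reduce
job.setOutputFormatClass(TextOutputFormat.class);  // output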
3. Sorting [first, second]
package mr;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class SortTest {

    /**
     * Composite key that sorts by firstNum first and secondNum second.
     * Hadoop serializes the key with write()/readFields() and orders it
     * with compareTo() during the shuffle.
     */
    public static class MyNewKey implements WritableComparable<MyNewKey> {
        long firstNum;
        long secondNum;

        public MyNewKey() {
            // Hadoop creates keys by reflection, so the no-arg constructor is required.
        }

        public MyNewKey(long firstNum, long secondNum) {
            this.firstNum = firstNum;
            this.secondNum = secondNum;
        }

        @Override
        public void write(DataOutput out) throws IOException {
            out.writeLong(firstNum);
            out.writeLong(secondNum);
        }

        @Override
        public void readFields(DataInput in) throws IOException {
            firstNum = in.readLong();
            secondNum = in.readLong();
        }

        /*
         * compareTo() is invoked whenever keys are sorted.
         */
        @Override
        public int compareTo(MyNewKey anotherKey) {
            System.out.println("sorting!!! this: " + firstNum + "," + secondNum
                    + " another: " + anotherKey.firstNum + "," + anotherKey.secondNum);
            // Long.compare avoids the sign errors that casting a long difference to int can cause.
            int cmp = Long.compare(firstNum, anotherKey.firstNum);
            if (cmp != 0) {
                // The first columns differ, so they decide the order.
                return cmp;
            }
            return Long.compare(secondNum, anotherKey.secondNum);
        }

        // The default HashPartitioner routes keys by hashCode(), so equal keys
        // must hash equally once more than one reducer is used.
        @Override
        public int hashCode() {
            int result = (int) (firstNum ^ (firstNum >>> 32));
            return 31 * result + (int) (secondNum ^ (secondNum >>> 32));
        }

        @Override
        public boolean equals(Object obj) {
            if (!(obj instanceof MyNewKey)) {
                return false;
            }
            MyNewKey other = (MyNewKey) obj;
            return firstNum == other.firstNum && secondNum == other.secondNum;
        }
    }
    public static class MyMapper extends Mapper<LongWritable, Text, MyNewKey, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Each input line looks like "<first>,<second>".
            String[] fields = value.toString().split(",");
            long firstNum = Long.parseLong(fields[0]);
            long secondNum = Long.parseLong(fields[1]);
            // Emit the composite type as the key so it drives the sort.
            MyNewKey newKey = new MyNewKey(firstNum, secondNum);
            context.write(newKey, new LongWritable(secondNum));
            System.out.println("mapping~");
        }
    }
    public static class MyReducer extends Reducer<MyNewKey, LongWritable, LongWritable, LongWritable> {
        @Override
        protected void reduce(MyNewKey key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            // Keys arrive already sorted, so writing them out in order yields the
            // sorted result; duplicate pairs collapse into a single output line.
            context.write(new LongWritable(key.firstNum), new LongWritable(key.secondNum));
            System.out.println("reducing~");
        }
    }
    private static final String INPUT_PATH = "hdfs://master:9000/input/cp.txt";
    private static final String OUTPUT_PATH = "hdfs://master:9000/output/c";

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        try {
            // Remove any previous output directory, or the job fails at submission.
            FileSystem fs = FileSystem.get(new URI(OUTPUT_PATH), conf);
            if (fs.exists(new Path(OUTPUT_PATH))) {
                fs.delete(new Path(OUTPUT_PATH), true);
            }
        } catch (URISyntaxException e) {
            e.printStackTrace();
        }
        Job job = Job.getInstance(conf, "myjob");
        job.setJarByClass(SortTest.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // job.setCombinerClass(MyReducer.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(LongWritable.class);
        job.setMapOutputKeyClass(MyNewKey.class);
        job.setMapOutputValueClass(LongWritable.class);
        FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
        FileOutputFormat.setOutputPath(job, new Path(OUTPUT_PATH));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
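For instance, assuming cp.txt contains the following (hypothetical) pairs:

3,3
3,2
1,1
2,2

the reducer emits them ordered by the first column and then the second, tab-separated in part-r-00000:

1	1
2	2
3	2
3	3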