1. Partial sort
By default, MapReduce sorts records by key within each partition.
2. Total sort
Records are ordered globally across all partitions. Approaches:
1) Use a single reducer.
2) Use a custom partition function.
Different key ranges are routed to different partitions; since keys are sorted automatically within each partition, the output is totally ordered across partitions (see the partitioner below and the driver sketch after it).
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class PassPartition extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text text, IntWritable intWritable, int numPartitions) {
        String key = text.toString();
        // The boundary strings must be tested in ascending order,
        // otherwise one of the partition ranges is unreachable.
        if (key.compareTo("aaaa") < 0) {
            return 0;
        } else if (key.compareTo("xxx") < 0) {
            return 1;
        } else {
            return 2;
        }
    }
}
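To use this partitioner, the driver registers it and sets a matching number of reduce tasks. A minimal driver fragment sketch (PassPartitionDriver is a hypothetical name for illustration; the mapper/reducer and I/O setup are elided):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class PassPartitionDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration());
        job.setJarByClass(PassPartitionDriver.class);
        // Route keys through the custom partitioner; the number of reduce
        // tasks must match the number of partitions it can return (3 here).
        job.setPartitionerClass(PassPartition.class);
        job.setNumReduceTasks(3);
        // ... mapper, reducer, input/output formats and paths as usual ...
        job.waitForCompletion(true);
    }
}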
3) Sampling // works poorly with plain-text data; for plain-text input, KeyValueTextInputFormat is recommended
//1. Set the partitioner class to TotalOrderPartitioner (shipped with MapReduce)
//2. Initialize a sampler => InputSampler.RandomSampler<Text,Text> sampler = new InputSampler.RandomSampler<Text,Text>(0.01,10);
//   Alternatives: SplitSampler, IntervalSampler
//3. Set the partition-file location => TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),new Path("D:/"));
//4. Write the sample data => InputSampler.writePartitionFile(job,sampler);
//5. Note: steps 1-4 must come after the job configuration and before the job is executed
1. Random sampling: resource-intensive and wasteful of performance
2. Split sampling
3. Interval sampling: best performance
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class PassApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        FileSystem fs = FileSystem.get(conf);
        // Initialize the job from the configuration
        Job job = Job.getInstance(conf);
        // Set the job name
        job.setJobName("word count");
        // Entry-point class of the job
        job.setJarByClass(PassApp.class);
        // Set the mapper class
        job.setMapperClass(PassMapper.class);
        // Set the reducer class
        job.setReducerClass(PassReducer.class);
        // Set the total-order partitioner class
        job.setPartitionerClass(TotalOrderPartitioner.class);
        // Map output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Reduce output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        //FileInputFormat.setMaxInputSplitSize(job,10);
        //FileInputFormat.setMinInputSplitSize(job,10);
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // Input path
        FileInputFormat.addInputPath(job, new Path("D:/wc/out"));
        // Output path
        FileOutputFormat.setOutputPath(job, new Path("D:/wc/out4"));
        if (fs.exists(new Path("D:/wc/out4"))) {
            fs.delete(new Path("D:/wc/out4"), true);
        }
        // Use three reducers
        job.setNumReduceTasks(3);
        /**
         * Random sampling: resource-intensive and wasteful of performance.
         * @param freq       probability with which each key is selected; should be greater
         *                   than desired sample count (e.g. 2) / total key count (e.g. 100)
         * @param numSamples total number of keys to select across all splits
         */
        // Choose the sampler type
        InputSampler.RandomSampler<Text, Text> sampler = new InputSampler.RandomSampler<Text, Text>(0.001, 8800);
        //InputSampler.SplitSampler<Text, Text> sampler = new InputSampler.SplitSampler<Text, Text>(10, 3);
        //InputSampler.IntervalSampler<Text, Text> sampler = new InputSampler.IntervalSampler<Text, Text>(0.001);
        // Set the partition-file location
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("D:/wc/par/"));
        // Write the sample data
        InputSampler.writePartitionFile(job, sampler);
        // Run the job
        boolean b = job.waitForCompletion(true);
    }
}
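PassMapper and PassReducer are referenced above but not shown in these notes. A minimal sketch of plausible implementations, assuming the input under D:/wc/out is earlier word-count output in word<TAB>count form, which KeyValueTextInputFormat delivers as (Text, Text) pairs (both class bodies are assumptions, not the original code):

// PassMapper.java (assumed): re-emit each word with its count parsed as an int
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class PassMapper extends Mapper<Text, Text, Text, IntWritable> {
    @Override
    protected void map(Text key, Text value, Context context)
            throws IOException, InterruptedException {
        context.write(key, new IntWritable(Integer.parseInt(value.toString())));
    }
}

// PassReducer.java (assumed): sum the counts per word; with TotalOrderPartitioner
// the three output files are globally ordered by key
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class PassReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable v : values) {
            sum += v.get();
        }
        context.write(key, new IntWritable(sum));
    }
}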
3. Secondary sort
Building on MapReduce's sort by key, the values associated with each key are sorted as well.
Example: finding the yearly maximum temperature:
1901: 10 20 30 50 40
1902: 30 20 10 11 -8
After the years are sorted, the temperatures within each year are sorted too.
Implementation:
1. Define a composite key so that year_temperature becomes a single key: a custom CompKey class implements the WritableComparable interface, providing custom serialization and a custom comparator (the sort logic).
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class CompKey implements WritableComparable<CompKey> {
    private String year;
    private int temp;

    // Defines the sort order
    @Override
    public int compareTo(CompKey o) {
        String oyear = o.getYear();
        String tyear = this.getYear();
        int otemp = o.getTemp();
        int ttemp = this.getTemp();
        // If the years are equal, compare temperatures in descending order,
        // so the highest temperature comes first within a year
        if (tyear.equals(oyear)) {
            return otemp - ttemp;
        }
        // Otherwise, order by year
        return tyear.compareTo(oyear);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(year);
        out.writeInt(temp);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.setYear(in.readUTF());
        this.setTemp(in.readInt());
    }

    @Override
    public String toString() {
        return "CompKey{" +
                "year='" + year + '\'' +
                ", temp=" + temp +
                '}';
    }

    public CompKey(String year, int temp) {
        this.year = year;
        this.temp = temp;
    }

    public CompKey() {
    }

    public String getYear() {
        return year;
    }

    public void setYear(String year) {
        this.year = year;
    }

    public int getTemp() {
        return temp;
    }

    public void setTemp(int temp) {
        this.temp = temp;
    }
}
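For completeness, the map task has to build CompKey instances from the raw records. A minimal mapper sketch, assuming whitespace-separated "year temperature" lines and NullWritable map values (MaxTempMapper is a hypothetical name; the driver would also need to register CompKey/NullWritable as the map output types):

import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Assumed mapper: turns a "year temperature" line into a CompKey so the
// shuffle sorts by year, then by temperature in descending order.
public class MaxTempMapper extends Mapper<LongWritable, Text, CompKey, NullWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] fields = value.toString().trim().split("\\s+");
        CompKey ck = new CompKey(fields[0], Integer.parseInt(fields[1]));
        context.write(ck, NullWritable.get());
    }
}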
2. Define a grouping comparator that merges all keys with the same year into one group; that is, the two distinct keys 1920 30 and 1920 40 are recognized as the same key. The grouping comparator works on the reduce side: extend WritableComparator with a MyGroupComparator class and override its compare() method.
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Reduce-side grouping comparator: keys such as "1902 20" and "1902 30"
 * are treated as the same key.
 */
public class MyGroupComparator extends WritableComparator {
    // The constructor is required; passing true tells the parent to create key instances
    protected MyGroupComparator() {
        super(CompKey.class, true);
    }

    // Comparison logic: two keys are equal as long as their years are equal
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        CompKey ck1 = (CompKey) a;
        CompKey ck2 = (CompKey) b;
        return ck1.getYear().compareTo(ck2.getYear());
    }
}
Register the grouping comparator in the main application:
job.setGroupingComparatorClass(MyGroupComparator.class);
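Because CompKey sorts temperatures within a year in descending order and the grouping comparator folds a whole year into one reduce call, the key seen at the start of each reduce() already carries that year's maximum temperature. A minimal reducer sketch under those assumptions (MaxTempReducer is a hypothetical name):

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

// Assumed reducer: the incoming key already holds the year's maximum
// temperature, since keys within a group arrive sorted descending by temp.
public class MaxTempReducer extends Reducer<CompKey, NullWritable, Text, IntWritable> {
    @Override
    protected void reduce(CompKey key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(new Text(key.getYear()), new IntWritable(key.getTemp()));
    }
}

Iterating the values instead would walk every temperature of the year in descending order, because Hadoop refills the same key instance as the value iterator advances.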