测试数据:
2030 59
1976 68
2030 19
1997 5
数据为“年份 温度”格式的文本,可以用Java代码生成。
生成10000条数据代码:
/**
 * Generates 10,000 lines of "year temperature" test data for the MapReduce jobs.
 * Years fall in [1970, 2069], temperatures in [-30, 69].
 *
 * @throws IOException if the output file cannot be created or written
 */
public void makeData() throws IOException {
    // Reuse a single Random instance instead of allocating one per iteration.
    Random random = new Random();
    // try-with-resources guarantees the writer is closed even if a write fails.
    try (FileWriter fw = new FileWriter("e:/mr/tmp/temp.txt")) {
        for (int i = 0; i < 10000; i++) {
            int year = 1970 + random.nextInt(100); // 1970..2069
            int temp = -30 + random.nextInt(100);  // -30..69
            fw.write("" + year + " " + temp + "\r\n");
        }
    }
}
MapReduce全排序
1、应用场景
当需要从大量数据中获取最大值或最小值时,需要先进行全局排序,从而减少检索时间,优化程序的运行效率。
2、实现方式
1、定义一个Reducer
2、自定义分区函数
3、使用hadoop采样机制
3、代码
/**
 * Driver for the total-order-sort job: samples the input to build partition
 * boundaries, then uses TotalOrderPartitioner so that the concatenated reducer
 * outputs form one globally sorted result.
 *
 * @param args args[0] = input path (sequence file), args[1] = output path
 */
public static void main(String args[]) throws Exception {
    Configuration conf = new Configuration();
    // Run against the local file system for testing.
    conf.set("fs.defaultFS", "file:///");
    Job job = Job.getInstance(conf);

    job.setJobName("MaxTempApp");                           // job name
    job.setJarByClass(MaxTempApp.class);                    // class used to locate the jar
    job.setInputFormatClass(SequenceFileInputFormat.class); // input format

    // Input and output paths come from the command line.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setMapperClass(MaxTempMapper.class);   // mapper class
    job.setReducerClass(MaxTempReducer.class); // reducer class
    // Note: a combiner computing an average would be incorrect here
    // (averages of averages are not the overall average).
    job.setNumReduceTasks(3);                  // one ordered range per reducer

    job.setMapOutputKeyClass(IntWritable.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(IntWritable.class);

    // Random sampler:
    //   freq             - probability of selecting each key (0.1)
    //   numSamples       - total number of samples to draw (6000)
    //   maxSplitsSampled - maximum number of splits to sample (3)
    InputSampler.Sampler<IntWritable, IntWritable> sampler =
            new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 6000, 3);

    // The partition file holds the sampled key boundaries (values are empty).
    // Use job.getConfiguration(), not the local conf: Job copies the
    // configuration internally, so settings made on the original conf
    // after getInstance() would be ignored.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("d:/mr/par.lst"));
    // Partition by sampled key ranges to get a total order across reducers.
    job.setPartitionerClass(TotalOrderPartitioner.class);

    // Write the partition boundaries before the job starts.
    InputSampler.writePartitionFile(job, sampler);

    // Propagate job success/failure to the caller instead of discarding it.
    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
MapReduce二次排序
1、应用场景
由于MapReduce只能对key排序,当需求是获取value的最大值最小值,对value进行排序称之为二次排序。
2、实现方式
1、自定义key
实现org.apache.hadoop.io.WritableComparable接口
2、自定义分区类
继承org.apache.hadoop.mapreduce.Partitioner类
3、定义分组对比器
继承org.apache.hadoop.io.WritableComparator类
4、定义自定义key的排序对比器
继承org.apache.hadoop.io.WritableComparator类
3、代码
自定义key
/**
 * Composite key for secondary sort: year (primary, ascending) and
 * temperature (secondary, descending).
 */
public class ComboKey implements WritableComparable<ComboKey> {
    private int year;
    private int temp;

    public int getYear() {
        return year;
    }

    public void setYear(int year) {
        this.year = year;
    }

    public int getTemp() {
        return temp;
    }

    public void setTemp(int temp) {
        this.temp = temp;
    }

    /**
     * Orders keys by year ascending, then temperature descending.
     * Integer.compare is used instead of subtraction, which can overflow
     * for extreme int values.
     */
    public int compareTo(ComboKey o) {
        if (year == o.getYear()) {
            // Temperature descending: compare the other key's temp first.
            return Integer.compare(o.getTemp(), temp);
        }
        return Integer.compare(year, o.getYear());
    }

    /** Serialization: write year then temperature. */
    public void write(DataOutput out) throws IOException {
        out.writeInt(year);
        out.writeInt(temp);
    }

    /** Deserialization: fields must be read in the same order write() emits them. */
    public void readFields(DataInput in) throws IOException {
        year = in.readInt();
        temp = in.readInt();
    }

    @Override
    public String toString() {
        return year + ":" + temp;
    }
}
自定义分区类
/**
 * Routes all records with the same year to the same reducer so each
 * reducer handles a disjoint set of years.
 */
public class YearPartitioner extends Partitioner<ComboKey, NullWritable> {
    public int getPartition(ComboKey key, NullWritable nullWritable, int numPartitions) {
        // Math.floorMod guards against a negative (invalid) partition index,
        // which plain % would produce if the year were ever negative.
        return Math.floorMod(key.getYear(), numPartitions);
    }
}
自定义分组对比器
/**
 * Grouping comparator: keys with the same year are treated as one group,
 * so a single reduce() call receives all temperatures for that year.
 */
public class YearGroupComparator extends WritableComparator {
    protected YearGroupComparator() {
        // true: instantiate keys so compare() receives deserialized objects.
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        // Integer.compare avoids subtraction overflow; no per-comparison
        // logging, which would run for every key pair during the sort.
        return Integer.compare(k1.getYear(), k2.getYear());
    }
}
自定义key排序对比器
/**
 * Sort comparator: delegates to ComboKey.compareTo, i.e. year ascending
 * then temperature descending.
 */
public class ComboKeyComparator extends WritableComparator {
    protected ComboKeyComparator() {
        // true: instantiate keys so compare() receives deserialized objects.
        super(ComboKey.class, true);
    }

    public int compare(WritableComparable a, WritableComparable b) {
        ComboKey k1 = (ComboKey) a;
        ComboKey k2 = (ComboKey) b;
        // The ordering is defined on the key class itself. No debug logging:
        // this method runs once per key pair during the shuffle sort.
        return k1.compareTo(k2);
    }
}
编写Mapper
/**
 * Parses one "year temperature" text line into a ComboKey. The value slot
 * is unused (NullWritable) because both fields travel inside the key.
 */
public class MaxTempMapper extends Mapper<LongWritable, Text, ComboKey, NullWritable> {
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // No per-record logging here: map() runs once per input line and
        // stdout output would dominate the job's runtime.
        String line = value.toString();
        String arr[] = line.split(" ");
        ComboKey keyOut = new ComboKey();
        keyOut.setYear(Integer.parseInt(arr[0]));
        keyOut.setTemp(Integer.parseInt(arr[1]));
        context.write(keyOut, NullWritable.get());
    }
}
编写Reduce
/**
 * Emits (year, temperature) for each group of keys. Because the sort
 * comparator orders temperatures descending within a year and the grouping
 * comparator groups by year, the key seen here should carry the highest
 * temperature of its year group.
 */
public class MaxTempReducer extends Reducer<ComboKey, NullWritable, IntWritable, IntWritable> {
    protected void reduce(ComboKey key, Iterable<NullWritable> values, Context context)
            throws IOException, InterruptedException {
        context.write(new IntWritable(key.getYear()), new IntWritable(key.getTemp()));
    }
}
编写App
/**
 * Driver for the secondary-sort job: wires the composite key, the
 * year-based partitioner, the grouping comparator, and the sort comparator
 * together so each reducer group yields the per-year maximum temperature.
 */
public class MaxTempApp {
    /**
     * @param args args[0] = input path (text), args[1] = output path
     */
    public static void main(String args[]) throws Exception {
        Configuration conf = new Configuration();
        // Run against the local file system for testing.
        conf.set("fs.defaultFS", "file:///");
        Job job = Job.getInstance(conf);

        job.setJobName("SecondarySortApp");             // job name
        job.setJarByClass(MaxTempApp.class);            // class used to locate the jar
        job.setInputFormatClass(TextInputFormat.class); // plain-text input

        // Input and output paths come from the command line.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(MaxTempMapper.class);   // mapper class
        job.setReducerClass(MaxTempReducer.class); // reducer class

        // Map output: composite key, empty value.
        job.setMapOutputKeyClass(ComboKey.class);
        job.setMapOutputValueClass(NullWritable.class);
        // Reduce output: (year, temperature).
        job.setOutputKeyClass(IntWritable.class);
        job.setOutputValueClass(IntWritable.class);

        // Partition by year, group by year, sort by (year asc, temp desc).
        job.setPartitionerClass(YearPartitioner.class);
        job.setGroupingComparatorClass(YearGroupComparator.class);
        job.setSortComparatorClass(ComboKeyComparator.class);

        job.setNumReduceTasks(3); // number of reducers

        // Propagate job success/failure to the caller instead of discarding it.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}