Three ways to get a totally sorted output:
1. Define a single reducer.
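With one reducer, every key flows through a single sorted reduce pass, so the lone output file is globally sorted with no custom partitioning. The only relevant job setting:

job.setNumReduceTasks(1); // one reducer -> one globally sorted output file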
2. Define a custom partitioner.
Set the partition boundaries yourself (for example, split the keys into three ranges by condition), as in the sketch below (class name is illustrative):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class YearPartitioner extends Partitioner<IntWritable, IntWritable> {
    @Override
    public int getPartition(IntWritable year, IntWritable temp, int parts) {
        // offset so the assumed data range starts at 1970
        int y = year.get() - 1970;
        if (y < 33) {
            return 0;
        } else if (y < 66) {
            return 1;
        } else {
            return 2;
        }
        /*
         * Dynamic partitioning: spread an assumed 100-year range evenly over `parts` reducers.
         * int width = 100 / parts;                  // years covered by each partition
         * int y0 = year.get() - 1970;
         * return Math.min(y0 / width, parts - 1);   // divide by the range width; clamp overflow into the last partition
         */
    }
}
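To wire it in, register the partitioner on the job and keep the reducer count equal to the number of ranges (job setup lines only; YearPartitioner is the illustrative class above):

job.setPartitionerClass(YearPartitioner.class);
job.setNumReduceTasks(3); // one reducer per range returned by getPartition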
3. Use Hadoop's sampling mechanism.
A sampler generates a partition file, and Hadoop's TotalOrderPartitioner then splits the key space according to it.
TotalOrderPartitioner // total-order partitioner class; reads the externally generated partition file to determine the range boundaries.
Use Hadoop's TotalOrderPartitioner together with a RandomSampler (a sampler that draws records through the job's InputFormat).
The sampling call must come at the very end of the job setup, after all other configuration; otherwise errors occur (writePartitionFile relies on the input format and map output key type already being set).
// Set the partition file on the job's own Configuration object, not on the earlier standalone conf:
TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("d:/mr/par.lst"));
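InputSampler.writePartitionFile stores the sampled split keys as a SequenceFile (keys of the map output key class, NullWritable values). A minimal sketch for inspecting the boundaries it chose, assuming the local path above (class name is illustrative):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;

public class ShowPartitionFile {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // read back the partition file; with 3 reducers it should hold 2 split keys
        SequenceFile.Reader reader = new SequenceFile.Reader(
                conf, SequenceFile.Reader.file(new Path("file:///d:/mr/par.lst")));
        IntWritable key = new IntWritable();
        NullWritable val = NullWritable.get();
        while (reader.next(key, val)) {
            System.out.println("split point: " + key.get());
        }
        reader.close();
    }
}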
App class:
package com.mao.hdfs.maxtemp;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class MaxTempApp {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");                   // run against the local file system

        Job job = Job.getInstance(conf);

        // job properties
        job.setJobName("MaxTempApp");                           // job name
        job.setJarByClass(MaxTempApp.class);                    // class used to locate the jar
        job.setInputFormatClass(SequenceFileInputFormat.class); // input format
        FileInputFormat.addInputPath(job, new Path(args[0]));   // input path
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path
        job.setMapperClass(MaxTempMapper.class);                // mapper class
        job.setReducerClass(MaxTempReducer.class);              // reducer class
        job.setNumReduceTasks(3);                               // number of reducers

        // map output types
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // reduce output types
        job.setOutputKeyClass(IntWritable.class);               // output key
        job.setOutputValueClass(IntWritable.class);             // output value

        // total-order partitioner, driven by the sampled partition file
        job.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("file:///d:/mr/par.lst"));

        // create the random sampler
        // freq: probability of selecting each key
        // numSamples: total number of samples to draw
        // maxSplitsSampled: maximum number of input splits to sample
        InputSampler.Sampler<IntWritable, IntWritable> sampler =
                new InputSampler.RandomSampler<IntWritable, IntWritable>(0.1, 3000, 3);

        // write the sampled keys to the partition file (must run after all job configuration above)
        InputSampler.writePartitionFile(job, sampler);

        job.waitForCompletion(true);
    }
}
Mapper class:
package com.mao.hdfs.maxtemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * MaxTempMapper: identity mapper; passes (year, temperature) pairs
 * through unchanged so the framework can sort and partition them.
 */
public class MaxTempMapper extends Mapper<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void map(IntWritable key, IntWritable value, Context context) throws IOException, InterruptedException {
        context.write(key, value);
    }
}
Reducer class:
package com.mao.hdfs.maxtemp;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/**
 * MaxTempReducer: emits the maximum temperature seen for each year.
 */
public class MaxTempReducer extends Reducer<IntWritable, IntWritable, IntWritable, IntWritable> {
    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int max = Integer.MIN_VALUE;
        for (IntWritable iw : values) {
            max = Math.max(max, iw.get());
        }
        context.write(key, new IntWritable(max));
    }
}
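For local testing, the job needs a SequenceFile of (year, temperature) IntWritable pairs as input. A minimal generator sketch, assuming a 1970..2069 year range to match the partitioner above; the path, record count, and temperature range are illustrative:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.SequenceFile;

import java.util.Random;

public class GenTempData {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "file:///");
        // write random (year, temperature) pairs as IntWritable/IntWritable
        SequenceFile.Writer writer = SequenceFile.createWriter(conf,
                SequenceFile.Writer.file(new Path("d:/mr/seq/temps.seq")),
                SequenceFile.Writer.keyClass(IntWritable.class),
                SequenceFile.Writer.valueClass(IntWritable.class));
        Random r = new Random();
        for (int i = 0; i < 10000; i++) {
            int year = 1970 + r.nextInt(100);   // matches the 100-year range assumed by the partitioner
            int temp = -30 + r.nextInt(100);
            writer.append(new IntWritable(year), new IntWritable(temp));
        }
        writer.close();
    }
}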