功能:同内存排序求最大值,只是中间结果不在内存中,避免了数量过大造成内存溢出的问题。
知识点:分组比较器(GroupingComparator)的使用
job.setGroupingComparatorClass(GroupingComparator.class);
自定义mr类SSData
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite key for MapReduce secondary sort: a pair of ints serialized as two
 * fixed-width 4-byte ints ({@code first} then {@code second}, 8 bytes total —
 * the grouping comparator relies on this fixed layout).
 *
 * <p>Sort order: ascending by {@code first}, then descending by {@code second}.
 */
public class SSData implements WritableComparable<SSData> {

    private int first;
    private int second;

    /** No-arg constructor required by Hadoop serialization. */
    public SSData() {
    }

    public SSData(int first, int second) {
        this.first = first;
        this.second = second;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(first);
        out.writeInt(second);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.first = in.readInt();
        this.second = in.readInt();
    }

    /**
     * Orders ascending by {@code first}, then descending by {@code second}.
     * Uses {@link Integer#compare} instead of subtraction, which can overflow
     * and return the wrong sign for large-magnitude operands.
     */
    @Override
    public int compareTo(SSData o) {
        int byFirst = Integer.compare(this.first, o.first); // ascending by first column
        if (byFirst != 0) {
            return byFirst;
        }
        return Integer.compare(o.second, this.second); // descending by second column
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + second;
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        SSData other = (SSData) obj;
        if (first != other.first)
            return false;
        if (second != other.second)
            return false;
        return true;
    }

    /**
     * @return the first
     */
    public int getFirst() {
        return first;
    }

    /**
     * @param first the first to set
     */
    public void setFirst(int first) {
        this.first = first;
    }

    /**
     * @return the second
     */
    public int getSecond() {
        return second;
    }

    /**
     * @param second the second to set
     */
    public void setSecond(int second) {
        this.second = second;
    }

    @Override
    public String toString() {
        return "[ " + first + " " + second + " ]";
    }
}
自定义分组比较器
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;
/**
* 分组比较器
* 1、要求自定义分组比较器实现RawComparator
* 2、实现两个比较方法
* 3、注意被比较属性的类型(可变长度不行)
* @author lyd
*
*/
public class GroupingComparator implements RawComparator<SSData> {

    // Number of serialized key bytes compared when grouping. SSData writes two
    // 4-byte ints (first, then second), so 8 bytes spans the WHOLE key.
    // NOTE(review): grouping on all 8 bytes means records group on the full
    // (first, second) pair; to group on `first` alone this would be 4 — confirm intent.
    private static final int GROUP_KEY_LENGTH = 8;

    /**
     * Object-based comparison. Mirrors the original: every pair of keys is
     * reported equal. The framework normally invokes the raw byte comparison
     * below instead of this method.
     */
    @Override
    public int compare(SSData left, SSData right) {
        return 0;
    }

    /**
     * Raw byte-level comparison over the first {@value #GROUP_KEY_LENGTH}
     * bytes of each serialized key (fixed-width fields only — variable-length
     * encodings would break this offset arithmetic).
     */
    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, GROUP_KEY_LENGTH, b2, s2, GROUP_KEY_LENGTH);
    }
}
MapReduce类FindMax2
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for the grouped-max job: finds maxima without holding intermediate
 * results in memory, using a composite key ({@code SSData}) plus a grouping
 * comparator so the framework does the sorting.
 */
public class FindMax2 extends ToolRunner implements Tool {

    /**
     * Configuration injected via {@link #setConf(Configuration)}.
     * Stored in a field so {@link #getConf()} returns it — the original
     * returned a fresh {@code Configuration} on every call, silently
     * discarding the {@code fs.defaultFS} value set in {@code setConf}.
     */
    private Configuration conf;

    /**
     * Mapper: parses "&lt;col0&gt; &lt;col1&gt;" lines into a composite key.
     */
    static class MyMapper extends Mapper<LongWritable, Text, SSData, IntWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Input line: two space-separated integers.
            String[] cols = value.toString().split(" ");
            // Column order is deliberately swapped: the 2nd input column becomes
            // the primary sort field, the 1st the secondary (descending) field.
            SSData composite = new SSData(Integer.parseInt(cols[1]), Integer.parseInt(cols[0]));
            context.write(composite, new IntWritable(Integer.parseInt(cols[0])));
        }
    }

    /**
     * Reducer: emits one separator line plus the first key of each group.
     * Because keys within a group arrive sorted descending on the second
     * field, the group's key holds its maximum.
     */
    static class MyReducer extends Reducer<SSData, IntWritable, Text, Text> {
        @Override
        protected void reduce(SSData key, Iterable<IntWritable> value, Context context)
                throws IOException, InterruptedException {
            context.write(new Text("---------------"), new Text(""));
            context.write(new Text(key.getFirst() + ""), new Text(key.getSecond() + ""));
        }
    }

    @Override
    public void setConf(Configuration conf) {
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        this.conf = conf; // keep it so getConf() hands back the configured instance
    }

    @Override
    public Configuration getConf() {
        // Fall back to a default Configuration only if setConf was never called.
        return conf != null ? conf : new Configuration();
    }

    /**
     * Driver method: configures and submits the job.
     *
     * @param args args[0] = input path, args[1] = output path
     * @return 0 on success, 1 on failure
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args.length < 2) {
            throw new IllegalArgumentException("Usage: FindMax2 <input path> <output path>");
        }
        // 1. Obtain the configuration (the one injected by ToolRunner via setConf).
        Configuration conf = getConf();
        // 2. Create the job.
        Job job = Job.getInstance(conf, "model01");
        // 3. Set the jar-bearing class.
        job.setJarByClass(FindMax2.class);
        // 4. Map-side settings.
        job.setMapperClass(MyMapper.class);
        job.setMapOutputKeyClass(SSData.class);
        job.setMapOutputValueClass(IntWritable.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Grouping comparator: controls which keys share a reduce() call.
        job.setGroupingComparatorClass(GroupingComparator.class);
        // 5. Reduce-side settings.
        job.setReducerClass(MyReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Delete the output directory if it already exists (Hadoop refuses to overwrite).
        Path output = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(output)) {
            fs.delete(output, true);
        }
        FileOutputFormat.setOutputPath(job, output);
        // 6. Submit and wait.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    /**
     * Main entry point.
     *
     * @param args command-line arguments (input path, output path)
     */
    public static void main(String[] args) {
        try {
            // Strip generic Hadoop options before handing args to run().
            String[] argss = new GenericOptionsParser(new Configuration(), args).getRemainingArgs();
            System.exit(ToolRunner.run(new FindMax2(), argss));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}