原创作品,允许转载,转载时请务必以超链接形式标明文章
原始出处、作者信息和本声明。否则将追究法律责任。
http://computerdragon.blog.51cto.com/6235984/1287721
package whut;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite map-output key made of a string part ({@code firstKey}) and an
 * int part ({@code secondKey}), both held as plain Java values.
 *
 * <p>The natural ordering ({@link #compareTo}) looks at {@code firstKey} only,
 * so it is coarser than {@link #equals}: two keys that compare as 0 may still
 * differ in {@code secondKey}. Ordering on the second part is left to a
 * dedicated sort comparator.
 */
public class TextInt implements WritableComparable<TextInt> {
    private String firstKey;
    private int secondKey;

    /**
     * No-arg constructor. Declared explicitly because the key type must be
     * instantiable without arguments before {@link #readFields} fills it in.
     */
    public TextInt() {
    }

    /** @return the string part of the key (may be {@code null} until set or read) */
    public String getFirstKey() {
        return firstKey;
    }

    /** @param firstKey the string part of the key */
    public void setFirstKey(String firstKey) {
        this.firstKey = firstKey;
    }

    /** @return the int part of the key */
    public int getSecondKey() {
        return secondKey;
    }

    /** @param secondKey the int part of the key */
    public void setSecondKey(int secondKey) {
        this.secondKey = secondKey;
    }

    /**
     * Serialises the key as a UTF string followed by an int.
     * {@code firstKey} must have been set before this is called.
     *
     * @param out sink to write to
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(firstKey);
        out.writeInt(secondKey);
    }

    /**
     * Restores both fields, reading them in the same order {@link #write} emits them.
     *
     * @param in source to read from
     * @throws IOException if the underlying stream fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        firstKey = in.readUTF();
        secondKey = in.readInt();
    }

    /**
     * Orders keys by {@code firstKey} in ascending order; swapping the two
     * operands would give descending order. {@code secondKey} is ignored.
     *
     * @param other the key to compare against
     * @return negative, zero or positive per {@link String#compareTo(String)}
     */
    @Override
    public int compareTo(TextInt other) {
        return this.getFirstKey().compareTo(other.getFirstKey());
    }

    /** Two keys are equal when both the string part and the int part match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof TextInt)) {
            return false;
        }
        TextInt other = (TextInt) obj;
        if (secondKey != other.secondKey) {
            return false;
        }
        return firstKey == null ? other.firstKey == null : firstKey.equals(other.firstKey);
    }

    /**
     * Value-based hash built from {@link String#hashCode()} and the int part,
     * so it does not depend on object identity and is the same in every JVM.
     */
    @Override
    public int hashCode() {
        int stringHash = (firstKey == null) ? 0 : firstKey.hashCode();
        return 31 * stringHash + secondKey;
    }

    /** @return the two parts separated by a tab */
    @Override
    public String toString() {
        return firstKey + "\t" + secondKey;
    }
}
package whut;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Comparator used for grouping: two {@link TextInt} keys are considered the
 * same group when their first key matches. The second key plays no part here.
 */
public class TextComparator extends WritableComparator {

    /**
     * The parent constructor has to be invoked; it is given the key class
     * and {@code true} as its second argument.
     */
    protected TextComparator() {
        super(TextInt.class, true);
    }

    /**
     * Compares two keys by their first key only.
     *
     * @param a left-hand key, expected to be a {@link TextInt}
     * @param b right-hand key, expected to be a {@link TextInt}
     * @return the result of {@link String#compareTo(String)} on the two first keys
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        final TextInt left = (TextInt) a;
        final TextInt right = (TextInt) b;
        final String leftGroup = left.getFirstKey();
        final String rightGroup = right.getFirstKey();
        return leftGroup.compareTo(rightGroup);
    }
}
package whut;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
/**
 * Sort comparator for {@link TextInt} keys: ascending by first key, and for
 * keys sharing the same first key (i.e. the same group), descending by second key.
 */
public class TextIntComparator extends WritableComparator {

    /** Passes the key class and {@code true} to the parent constructor. */
    public TextIntComparator() {
        super(TextInt.class, true);
    }

    /**
     * Compares two keys. Putting {@code a} before {@code b} in a comparison
     * yields ascending order; putting {@code b} first yields descending order.
     *
     * @param a left-hand key, expected to be a {@link TextInt}
     * @param b right-hand key, expected to be a {@link TextInt}
     * @return negative, zero or positive as {@code a} sorts before, with, or after {@code b}
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TextInt left = (TextInt) a;
        TextInt right = (TextInt) b;

        // Different first keys means different groups: order the groups ascending.
        int byFirstKey = left.getFirstKey().compareTo(right.getFirstKey());
        if (byFirstKey != 0) {
            return byFirstKey;
        }

        // Same group: descending by second key (note the swapped operands).
        // Integer.compare is used instead of subtraction, which overflows and
        // flips the sign when the operands are far apart (e.g. MIN_VALUE vs 1).
        return Integer.compare(right.getSecondKey(), left.getSecondKey());
    }
}
package whut;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Partitioner keyed on the first part of {@link TextInt} only, so every record
 * with the same first key lands in the same partition. The type parameters are
 * the map output key and value types.
 */
public class KeyPartitioner extends Partitioner<TextInt, IntWritable> {

    /**
     * @param key           composite key; only its first key is hashed
     * @param value         unused
     * @param numPartitions number of partitions to spread keys over
     * @return the first key's hash with the sign bit cleared, modulo {@code numPartitions}
     */
    @Override
    public int getPartition(TextInt key, IntWritable value, int numPartitions) {
        final int rawHash = key.getFirstKey().hashCode();
        // Clear the sign bit so the remainder below can never be negative.
        final int nonNegativeHash = rawHash & Integer.MAX_VALUE;
        return nonNegativeHash % numPartitions;
    }
}
package whut;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.Mapper.Context;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Driver for a job that groups records by name and sorts the values inside
 * each group. Input is read with {@link KeyValueTextInputFormat}, one record
 * per line such as {@code name1 5}; both key and value arrive as {@link Text}.
 */
public class SortMain extends Configured implements Tool {

    /**
     * Turns each (name, number) line into a {@link TextInt} composite key
     * carrying both parts, with the number repeated as the value.
     */
    public static class GroupMapper extends
            Mapper<Text, Text, TextInt, IntWritable> {
        // Reused across map() calls; both are fully overwritten for every record.
        public IntWritable second = new IntWritable();
        public TextInt tx = new TextInt();

        /**
         * @param key     the name part of the input line
         * @param value   the numeric part of the input line, as text
         * @param context sink for the (composite key, number) pair
         * @throws IOException if the value is not a valid integer, or on write failure
         */
        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            String lineKey = key.toString();
            // Tolerate stray whitespace around the number.
            String lineValue = value.toString().trim();
            int lineInt;
            try {
                lineInt = Integer.parseInt(lineValue);
            } catch (NumberFormatException e) {
                // Still fails the record, but says which one and why.
                throw new IOException("Value for key '" + lineKey
                        + "' is not an integer: '" + lineValue + "'", e);
            }
            tx.setFirstKey(lineKey);
            tx.setSecondKey(lineInt);
            second.set(lineInt);
            context.write(tx, second);
        }
    }

    /**
     * Joins all values of one group into a comma-separated string and emits
     * it under the group's first key.
     */
    public static class GroupReduce extends Reducer<TextInt, IntWritable, Text, Text> {
        /**
         * @param key     a key of the group; only its first key is written out
         * @param values  the group's values, in the order they are delivered
         * @param context sink for the (name, joined values) pair
         */
        @Override
        protected void reduce(TextInt key, Iterable<IntWritable> values,
                Context context)
                throws IOException, InterruptedException {
            StringBuilder joined = new StringBuilder();
            for (IntWritable val : values) {
                if (joined.length() > 0) {
                    joined.append(',');
                }
                joined.append(val);
            }
            context.write(new Text(key.getFirstKey()), new Text(joined.toString()));
        }
    }

    /**
     * Configures and runs the job.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output
     *             path (which must not exist yet); extra arguments are ignored
     * @return 0 on success, 1 if the job failed, 2 if arguments are missing
     */
    @Override
    public int run(String[] args) throws Exception {
        if (args == null || args.length < 2) {
            System.err.println("Usage: SortMain <input path> <output path>");
            return 2;
        }

        Configuration conf = getConf();
        Job job = new Job(conf, "SecondarySort");
        job.setJarByClass(SortMain.class);

        // Input files are expected to be on HDFS already.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Output also goes to HDFS; the output directory must not already exist.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(GroupMapper.class);
        job.setReducerClass(GroupReduce.class);

        // Partition on the first key only.
        job.setPartitionerClass(KeyPartitioner.class);
        // Grouping: which keys are handed to the same reduce() call.
        job.setGroupingComparatorClass(TextComparator.class);
        // Key point: how keys are sorted before reaching the reducer, which is
        // what fixes the order of the values inside each group.
        job.setSortComparatorClass(TextIntComparator.class);

        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // No output format is set, so the job's default output format applies.

        // Map output key/value types.
        job.setMapOutputKeyClass(TextInt.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Reduce output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        return job.waitForCompletion(true) ? 0 : 1;
    }

    /** Runs the tool through {@link ToolRunner} and exits with its return code. */
    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SortMain(), args);
        System.exit(exitCode);
    }
}