Original work. Reposting is permitted, provided the repost credits the original source, the author, and this notice via hyperlink. Otherwise legal liability will be pursued.
http://computerdragon.blog.51cto.com/6235984/1287721
package whut;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

// Custom composite key: groups by a String field and sorts by an int field.
// The fields use plain Java types.
public class TextInt implements WritableComparable<TextInt> {
    private String firstKey;
    private int secondKey;

    // A default (no-argument) constructor is required so Hadoop can
    // instantiate the key via reflection during deserialization.
    public TextInt() {
    }

    public String getFirstKey() {
        return firstKey;
    }

    public void setFirstKey(String firstKey) {
        this.firstKey = firstKey;
    }

    public int getSecondKey() {
        return secondKey;
    }

    public void setSecondKey(int secondKey) {
        this.secondKey = secondKey;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(firstKey);
        out.writeInt(secondKey);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        firstKey = in.readUTF();
        secondKey = in.readInt();
    }

    // Map-side key comparisons go through this method by default.
    @Override
    public int compareTo(TextInt ti) {
        // Controls ascending vs. descending order:
        // this.getFirstKey().compareTo(...) gives ascending order;
        // swapping the two operands would give descending order.
        return this.getFirstKey().compareTo(ti.getFirstKey());
    }
}
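To sanity-check that write() and readFields() are symmetric, a quick round-trip through a byte buffer is useful. The following sketch is not from the original post; the class name TextIntRoundTrip and the sample values are illustrative only.

package whut;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

// Hypothetical helper: serializes a TextInt and reads it back,
// confirming the Writable contract holds.
public class TextIntRoundTrip {
    public static void main(String[] args) throws IOException {
        TextInt original = new TextInt();
        original.setFirstKey("name1");
        original.setSecondKey(5);

        // Serialize with write()
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance with readFields()
        TextInt copy = new TextInt();
        copy.readFields(new DataInputStream(
                new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println(copy.getFirstKey() + " " + copy.getSecondKey()); // name1 5
    }
}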
package whut;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Grouping comparator: decides which keys land in the same reduce group.
// Grouping looks at only one field of the composite key (firstKey).
public class TextComparator extends WritableComparator {
    // Must call the parent constructor. Passing true makes
    // WritableComparator deserialize TextInt instances so that the
    // object-based compare() below can be used.
    protected TextComparator() {
        super(TextInt.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TextInt ti1 = (TextInt) a;
        TextInt ti2 = (TextInt) b;
        return ti1.getFirstKey().compareTo(ti2.getFirstKey());
    }
}
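The effect is that two keys with the same firstKey compare as equal (0), so the framework feeds their values to a single reduce() call no matter what secondKey holds. A minimal check, not from the original post; because the constructor is protected, this sketch assumes it lives in the same whut package:

package whut;

// Illustrative only: keys sharing firstKey form one group.
public class GroupingDemo {
    public static void main(String[] args) {
        TextInt a = new TextInt();
        a.setFirstKey("name1");
        a.setSecondKey(3);

        TextInt b = new TextInt();
        b.setFirstKey("name1");
        b.setSecondKey(8);

        // 0 means "same group" despite the different second fields.
        System.out.println(new TextComparator().compare(a, b)); // 0
    }
}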
package whut;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Sort comparator: orders keys within a group by the second field.
public class TextIntComparator extends WritableComparator {
    public TextIntComparator() {
        super(TextInt.class, true);
    }

    // This is where the sort order is managed.
    // Keys must first be compared on the group field so that keys of the
    // same group (same firstKey) end up adjacent in the sorted stream.
    // Comparing a before b yields ascending order;
    // comparing b before a yields descending order.
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        TextInt ti1 = (TextInt) a;
        TextInt ti2 = (TextInt) b;
        // Keys belong to the same group iff their first fields are equal.
        if (!ti1.getFirstKey().equals(ti2.getFirstKey()))
            return ti1.getFirstKey().compareTo(ti2.getFirstKey());
        // Within a group, sort secondKey in descending order.
        // Integer.compare avoids the overflow risk of plain subtraction.
        return Integer.compare(ti2.getSecondKey(), ti1.getSecondKey());
    }
}
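A quick way to confirm the descending in-group order is to compare two keys from the same group directly. Illustrative only, not from the original post:

package whut;

// Hypothetical check: within the "name1" group, the key with the larger
// secondKey should sort first (descending order).
public class SortOrderDemo {
    public static void main(String[] args) {
        TextIntComparator cmp = new TextIntComparator();

        TextInt low = new TextInt();
        low.setFirstKey("name1");
        low.setSecondKey(3);

        TextInt high = new TextInt();
        high.setFirstKey("name1");
        high.setSecondKey(8);

        // Negative result: 8 sorts before 3 inside the group.
        System.out.println(cmp.compare(high, low) < 0); // true
    }
}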
package whut;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// The type parameters are the map output key/value types.
// Partitioning uses firstKey only, so every record of a group
// is routed to the same reducer.
public class KeyPartitioner extends Partitioner<TextInt, IntWritable> {
    @Override
    public int getPartition(TextInt key, IntWritable value, int numPartitions) {
        // Masking with Integer.MAX_VALUE clears the sign bit, so a
        // negative hashCode cannot produce a negative partition index.
        return (key.getFirstKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}
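The bitmask matters because String.hashCode() can be negative, and even Math.abs() fails for Integer.MIN_VALUE. A standalone illustration, not from the original post:

// Illustrative only: "polygenelubricants".hashCode() happens to be
// Integer.MIN_VALUE, the one value Math.abs() cannot fix; the mask
// still yields a valid, non-negative partition index.
public class PartitionDemo {
    public static void main(String[] args) {
        int hash = "polygenelubricants".hashCode(); // == Integer.MIN_VALUE
        int numPartitions = 4;
        System.out.println((hash & Integer.MAX_VALUE) % numPartitions); // 0
    }
}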
package whut;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.KeyValueTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

// Driver for jobs that need grouping plus in-group sorting.
public class SortMain extends Configured implements Tool {

    // The input format is KeyValueTextInputFormat, so a line such as
    //   name1    5
    // arrives as a (Text, Text) pair split on the first tab.
    public static class GroupMapper extends Mapper<Text, Text, TextInt, IntWritable> {
        private final IntWritable second = new IntWritable();
        private final TextInt tx = new TextInt();

        @Override
        protected void map(Text key, Text value, Context context)
                throws IOException, InterruptedException {
            String lineKey = key.toString();
            String lineValue = value.toString();
            int lineInt = Integer.parseInt(lineValue);
            tx.setFirstKey(lineKey);
            tx.setSecondKey(lineInt);
            second.set(lineInt);
            context.write(tx, second);
        }
    }

    // The reducer concatenates each group's (already sorted) values.
    public static class GroupReduce extends Reducer<TextInt, IntWritable, Text, Text> {
        @Override
        protected void reduce(TextInt key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            StringBuilder sb = new StringBuilder();
            for (IntWritable val : values) {
                sb.append(val).append(",");
            }
            if (sb.length() > 0) {
                sb.deleteCharAt(sb.length() - 1); // drop the trailing comma
            }
            context.write(new Text(key.getFirstKey()), new Text(sb.toString()));
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = new Job(conf, "SecondarySort");
        job.setJarByClass(SortMain.class);

        // Input path: the file must already be in HDFS.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Output path: also in HDFS, and the directory must not exist yet.
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(GroupMapper.class);
        job.setReducerClass(GroupReduce.class);

        // Partitioner: routes records to reducers by firstKey.
        job.setPartitionerClass(KeyPartitioner.class);

        // The next two settings control how keys are grouped and ordered
        // on their way to the reducer.
        // Grouping comparator: which keys are placed in the same group.
        job.setGroupingComparatorClass(TextComparator.class);
        // Sort comparator: how keys are ordered before reaching the reducer.
        // This is where the in-group sort is configured.
        /************* key point **********/
        job.setSortComparatorClass(TextIntComparator.class);

        // Input format.
        job.setInputFormatClass(KeyValueTextInputFormat.class);
        // The output format is the default, TextOutputFormat.

        // Map output key/value types.
        job.setMapOutputKeyClass(TextInt.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce output key/value types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        job.waitForCompletion(true);
        return job.isSuccessful() ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new SortMain(), args);
        System.exit(exitCode);
    }
}
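For reference, an end-to-end run might look like the following; the jar name and HDFS paths are illustrative, not from the original post. Given an input file of tab-separated lines

    name1	5
    name2	3
    name1	8
    name2	9
    name1	2

the job groups by the first column and sorts each group's values in descending order, producing

    name1	8,5,2
    name2	9,3

when invoked as, for example:

    hadoop jar secondarysort.jar whut.SortMain /user/hadoop/input /user/hadoop/output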