Hadoop技术内幕中指出Top K算法有两步:一是统计词频,二是找出词频最高的前K个词。在网上找了很多MapReduce的Top K案例,这些案例都只有排序功能,所以自己写了个案例。
这个案例分两个步骤:第一个就是wordCount案例,第二个就是排序功能。
一,统计词频
package TopK;

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * Step 1 of the Top-K job chain: counts how often each word occurs.
 *
 * @author zx (zhangxian1991@qq.com)
 */
public class WordCount {

    /**
     * Tokenizes each input line and emits (word, 1) for every token.
     */
    public static class Map extends Mapper<Object, Text, Text, IntWritable> {

        // Reused output value: every token contributes a count of 1.
        private final IntWritable one = new IntWritable(1);
        // Reused output key to avoid allocating a new Text per token.
        private final Text outWord = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {
            StringTokenizer st = new StringTokenizer(value.toString());
            while (st.hasMoreTokens()) {
                // Strip quotes and periods so `word.` and `word` count together.
                String word = st.nextToken().replace("\"", "").replace("'", "").replace(".", "");
                // Stripping punctuation may leave an empty token; skip it.
                if (!word.isEmpty()) {
                    outWord.set(word);
                    context.write(outWord, one);
                }
            }
        }
    }

    /**
     * Sums the partial counts for each word and emits (word, total).
     */
    public static class Reduce extends Reducer<Text, IntWritable, Text, IntWritable> {

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Sum the actual values instead of counting entries, so the
            // result stays correct when this class also runs as a combiner.
            int count = 0;
            for (IntWritable partial : values) {
                count += partial.get();
            }
            context.write(key, new IntWritable(count));
        }
    }

    /**
     * Configures and runs the word-count job.
     *
     * @param in  HDFS input file/directory
     * @param out HDFS output directory (must not already exist)
     * @return true if the job completed successfully
     */
    @SuppressWarnings("deprecation")
    public static boolean run(String in, String out)
            throws IOException, ClassNotFoundException, InterruptedException {

        Configuration conf = new Configuration();

        Job job = new Job(conf, "WordCount");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(Map.class);
        // The reducer sums values, so it is safe to reuse as a combiner.
        job.setCombinerClass(Reduce.class);
        job.setReducerClass(Reduce.class);

        // Map output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, new Path(out));

        return job.waitForCompletion(true);
    }

}
二,排序并求出词频最高的前K个词
package TopK;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
import java.util.Map.Entry;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * Step 2 of the Top-K job chain: re-keys word-count output by frequency so
 * the shuffle sorts it, and writes the K most frequent words to a side
 * output via MultipleOutputs.
 *
 * @author zx (zhangxian1991@qq.com)
 */
public class Sort {

    /**
     * Parses each word-count line ("word&lt;TAB&gt;count") and emits
     * (count, word) so the framework sorts records by frequency.
     */
    public static class Map extends Mapper<Object, Text, IntWritable, Text> {

        // Compiled once instead of Pattern.matches() per token.
        private static final Pattern DIGITS = Pattern.compile("\\d+");

        // Output key: the word's frequency.
        private final IntWritable outKey = new IntWritable();
        // Output value: the word itself.
        private final Text outValue = new Text();

        @Override
        protected void map(Object key, Text value, Context context)
                throws IOException, InterruptedException {

            StringTokenizer st = new StringTokenizer(value.toString());
            boolean haveCount = false;
            boolean haveWord = false;
            while (st.hasMoreTokens()) {
                String element = st.nextToken();
                // The numeric token is the count; anything else is the word.
                if (DIGITS.matcher(element).matches()) {
                    outKey.set(Integer.parseInt(element));
                    haveCount = true;
                } else {
                    outValue.set(element);
                    haveWord = true;
                }
            }

            // Emit only when the line yielded both a word and a count; the
            // original wrote stale key/value pairs for malformed lines.
            if (haveCount && haveWord) {
                context.write(outKey, outValue);
            }
        }

    }

    /**
     * Writes every (word, count) pair to the regular output and keeps the K
     * most frequent words, which are emitted in cleanup().
     *
     * NOTE: a global top-K needs all records in one reducer; run() enforces
     * this with setNumReduceTasks(1).
     * NOTE(review): keys arrive in the default ascending IntWritable order;
     * set a descending sort comparator on the job if descending output of
     * the full listing is required.
     */
    public static class Reduce extends
            Reducer<IntWritable, Text, Text, IntWritable> {

        // How many of the most frequent words to keep.
        private static final int K = 10;

        // Frequency -> words with that frequency, highest frequency first.
        // A list per frequency fixes the original TreeMap<MyInt,String>
        // design, in which two words with the same count had equal keys and
        // silently overwrote each other.
        private final TreeMap<Integer, List<String>> top =
                new TreeMap<Integer, List<String>>(Collections.reverseOrder());

        // Total number of words currently stored across all lists in 'top'.
        private int stored = 0;

        @Override
        protected void reduce(IntWritable key, Iterable<Text> values,
                Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                // Full frequency-sorted listing goes to the job's output.
                context.write(text, key);

                List<String> words = top.get(key.get());
                if (words == null) {
                    words = new ArrayList<String>();
                    top.put(key.get(), words);
                }
                words.add(text.toString());
                stored++;

                // Once more than K words are held, evict one word with the
                // smallest frequency (lastKey under the reversed order).
                if (stored > K) {
                    List<String> smallest = top.get(top.lastKey());
                    smallest.remove(smallest.size() - 1);
                    if (smallest.isEmpty()) {
                        top.remove(top.lastKey());
                    }
                    stored--;
                }
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            // The side-output directory is passed through the configuration.
            String path = context.getConfiguration().get("topKout");
            MultipleOutputs<Text, IntWritable> mos =
                    new MultipleOutputs<Text, IntWritable>(context);
            try {
                for (Entry<Integer, List<String>> entry : top.entrySet()) {
                    for (String word : entry.getValue()) {
                        mos.write("topKMOS", new Text(word),
                                new IntWritable(entry.getKey()), path);
                    }
                }
            } finally {
                // Always release the side-output writers.
                mos.close();
            }
        }

    }

    /**
     * Configures and runs the sort/top-K job.
     *
     * @param in      word-count output directory (this job's input)
     * @param out     directory for the full frequency-sorted output
     * @param topKout directory for the top-K side output
     */
    @SuppressWarnings("deprecation")
    public static void run(String in, String out, String topKout)
            throws IOException, ClassNotFoundException, InterruptedException {

        Path outPath = new Path(out);

        Configuration conf = new Configuration();

        // Tell the reducer where to write the top-K side output.
        conf.set("topKout", topKout);

        Job job = new Job(conf, "Sort");
        job.setJarByClass(Sort.class);
        job.setMapperClass(Map.class);
        job.setReducerClass(Reduce.class);
        // A single reducer is required for a globally correct top-K.
        job.setNumReduceTasks(1);

        // Map output types.
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(Text.class);

        // Reduce output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Register the named output used for the top-K records. The value
        // class must be IntWritable — the reducer writes IntWritable counts;
        // the original registered Text.class here, a type mismatch.
        MultipleOutputs.addNamedOutput(job, "topKMOS", TextOutputFormat.class,
                Text.class, IntWritable.class);

        // Input and output paths.
        FileInputFormat.addInputPath(job, new Path(in));
        FileOutputFormat.setOutputPath(job, outPath);

        job.waitForCompletion(true);
    }

}
自己封装的Int
01.
1
package
TopK;
02.
2
03.
3
public
class
MyInt
implements
Comparable<MyInt>{
04.
4
private
Integer value;
05.
5
06.
6
public
MyInt(Integer value){
07.
7
this
.value = value;
08.
8
}
09.
9
10.
10
public
int
getValue() {
11.
11
return
value;
12.
12
}
13.
13
14.
14
public
void
setValue(
int
value) {
15.
15
this
.value = value;
16.
16
}
17.
17
18.
18
@Override
19.
19
public
int
compareTo(MyInt o) {
20.
20
return
value.compareTo(o.getValue());
21.
21
}
22.
22
23.
23
24.
24
}
运行入口
01.
1
package
TopK;
02.
2
03.
3
import
java.io.IOException;
04.
4
05.
5
/**
06.
6 *
07.
7 * @author zx
08.
8 *zhangxian1991@qq.com
09.
9 */
10.
10
public
class
TopK {
11.
11
public
static
void
main(String args[])
throws
ClassNotFoundException, IOException, InterruptedException{
12.
12
13.
13
//要统计字数,排序的文字
14.
14
String in =
"hdfs://localhost:9000/input/MaDing.text"
;
15.
15
16.
16
//统计字数后的结果
17.
17
String <a href=
"http://www.it165.net/edu/ebg/"
target=
"_blank"
class
=
"keylink"
>word</a>Cout =
"hdfs://localhost:9000/out/wordCout"
;
18.
18
19.
19
//对统计完后的结果再排序后的内容
20.
20
String sort =
"hdfs://localhost:9000/out/sort"
;
21.
21
22.
22
//前K条
23.
23
String topK =
"hdfs://localhost:9000/out/topK"
;
24.
24
25.
25
//如果统计字数的job完成后就开始排序
26.
26
if
(WordCount.run(in, wordCout)){
27.
27
Sort.run(wordCout, sort,topK);
28.
28
}
29.
29
30.
30
}
31.
31
}
MapReduce TopK统计加排序
最新推荐文章于 2024-08-27 06:39:08 发布