昨天部署好了 Hadoop 平台,今天做一个简单的 demo:
从网上下载了几部篇幅较长的英文小说,对其中的英文单词做词频统计。代码如下:
Mapper类
import java.io.IOException;
import java.util.Locale;
import java.util.regex.Pattern;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
* Mapper类
*
* @author taizhimin
*
*/
public class WordCountMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
private Text kt = new Text();
private final static IntWritable vt = new IntWritable(1);
// Map方法
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
String word = value.toString();
word = word.toLowerCase();
word = word.replaceAll("[^a-z]", " ");
String[] arr = word.split(" ");
for (int i = 0; i < arr.length; i++) {
kt.set(arr[i]);
context.write(kt, vt);
}
}
}
Reducer类
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
* Reduce类
*
* @author taizhimin
*
*/
public class WordCountReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable vt = new IntWritable();
protected void reduce(Text key, Iterable<IntWritable> values,
Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable intVal : values) {
sum += intVal.get();
}
vt.set(sum);
context.write(key, vt);
}
}
作业类(Job 驱动程序)
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the word-count job: configures the job from two command-line
 * paths, submits it, and waits for it to finish.
 *
 * @author taizhimin
 */
public class WordCount {

    /**
     * Configures and runs the job.
     *
     * <p>Prints the boolean result of {@code waitForCompletion} to standard
     * output and exits with status 0 on success, 1 on job failure, or 2 when
     * the required arguments are missing.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the
     *             output path; further arguments are ignored
     * @throws Exception if job setup or execution fails
     */
    public static void main(String[] args) throws Exception {
        // Fail with a usage message instead of an ArrayIndexOutOfBoundsException.
        if (args.length < 2) {
            System.err.println("Usage: WordCount <input path> <output path>");
            System.exit(2);
        }

        // Job.getInstance() replaces the deprecated no-argument Job constructor.
        Job job = Job.getInstance();
        job.setJarByClass(WordCount.class);
        job.setJobName("WordCount");

        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        job.setMapperClass(WordCountMapper.class);
        // The reducer is set as combiner as well: it consumes and produces
        // (Text, IntWritable), so it can be applied to map output directly.
        job.setCombinerClass(WordCountReducer.class);
        job.setReducerClass(WordCountReducer.class);

        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        boolean succeeded = job.waitForCompletion(true);
        System.out.println(succeeded);
        // Propagate the outcome so calling scripts can detect a failed job.
        System.exit(succeeded ? 0 : 1);
    }
}
下面是统计结果中词频最高的前一百个单词(总共统计出 3 万多个不同的单词)。这里的排序是我把结果粘贴到 Excel 里做的;最后请教各位大神:怎样在 MapReduce 中直接把结果按词频排序?
the 72843
of 31328
and 30296
to 27180
a 26360
he 20489
in 18352
was 16387
his 15130
that 13632
it 12314
had 11745
i 10300
you 9614
with 8842
is 8698
her 7747
on 7506
him 7169
as 7063
at 7043
not 7043
s 6561
she 6548
for 6413
which 5758
this 5748
said 5211
but 5059
have 4941
one 4877
they 4828
be 4726
from 4465
all 4383
there 4176
by 3880
no 3880
were 3793
who 3689
my 3600
man 3528
are 3267
what 3266
an 3200
me 3172
would 3042
when 3016
so 2905
been 2810
them 2734
we 2707
their 2577
out 2546
will 2518
up 2362
your 2289
if 2250
then 2112
do 2096
could 2092
t 2090
did 2043
more 2022
like 1994
into 1967
two 1900
or 1874
than 1694
don 1672
has 1643
himself 1635
old 1627
now 1611
only 1593
father 1574
men 1554
time 1551
little 1529
made 1527
other 1508
marius 1471
some 1469
m 1466
back 1422
about 1394
its 1383
these 1382
hand 1369
lord 1359
over 1356
very 1342
good 1321
even 1307
eyes 1306
see 1301
know 1300
jean 1275
here 1270
can 1259