其实统计文章中单词的个数问题是我们曾经学习任何一门语言都可能遇到的例子,这个例子在实际业务场景中可能扩展为:分析网站发表的文章是否包含过激言论(言论中包含敏感词组或单词,国家领导人名字等)
有人说要统计一篇文章单词的出现次数,非常简单嘛,用一个Map就搞定了,可是这里我们提到hadoop,就告诉你,我们要处理的数据量可不是MB,GB级别的了
下面是这个例子的具体实现,都有注释。由于我使用了Eclipse搭建的环境,所以直接Run on Hadoop,当然,你也可以将java文件打成jar包后自己写命令行运行。
package thinking4java;
import java.util.StringTokenizer;
import java.util.regex.Pattern;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCountDemo {
/**@author thinking4java
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
if (args == null || args.length != 2) {
System.err.println("Lack input && output args, pls Usage: WordCountDemo <input path> <output path>");
System.exit(-1);
}
Job job = new Job();
job.setJarByClass(WordCountDemo.class);
// 为job指定输入文件
FileInputFormat.addInputPath(job, new Path(args[0]));
// 为job指定输出文件
FileOutputFormat.setOutputPath(job, new Path(args[1]));
// mapper
job.setMapperClass(WordCounterMapper.class);
// reducer
job.setReducerClass(WordCounterReducer.class);
// 输出key类型
job.setOutputKeyClass(Text.class);
// 输出value类型
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
/**
* mapper
* @author thinking4java
*
*/
private static class WordCounterMapper extends
Mapper<LongWritable, Text, Text, IntWritable> {
protected void map(LongWritable key, Text value,
Mapper<LongWritable, Text, Text, IntWritable>.Context context)
throws java.io.IOException, InterruptedException {
//读取一行
String line = value.toString();
//构造字符串分析器
StringTokenizer st = new StringTokenizer(line);
while (st.hasMoreElements()) {
String token = st.nextToken();
if (!isWord(token)) {
//不是单词
continue;
}
//这个单词出现了一次
context.write(new Text(token), new IntWritable(1));
}
};
}
/**
* reducer
* @author thinking4java
*
*/
static class WordCounterReducer extends
Reducer<Text, IntWritable, Text, IntWritable> {
protected void reduce(Text key, java.lang.Iterable<IntWritable> values,
Reducer<Text, IntWritable, Text, IntWritable>.Context context)
throws java.io.IOException, InterruptedException {
int sum = 0;
for (IntWritable value : values) {
sum += value.get();
}
context.write(key, new IntWritable(sum));
};
}
/**
* 是否为合法的单词[a-zA-Z]+
* @author thinking4java
* @param input
* @return
*/
static boolean isWord(String input) {
return input.matches("[a-zA-Z]+");
}
}