A Hand-Written Hadoop MapReduce WordCount Program
Below is my WordCount program. The code is explained in detail in the inline comments, so I won't belabor it here. When I was learning, I wrote WordCount following the standard MapReduce boilerplate, which works very well: first write out a frame, then add flesh to it.
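As a rough sketch of that frame (the class name Skeleton and the empty method bodies are placeholders of mine, not part of the final program), the boilerplate looks like this:

```java
package org.hadoop.MapReduce;

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

//The bare frame: a Mapper, a Reducer, and a driver. The generic parameters
//are <input key, input value, output key, output value> for each stage.
public class Skeleton {

    static class MyMapper extends Mapper<LongWritable, Text, Text, Text> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            //map logic goes here: parse value, emit intermediate pairs
        }
    }

    static class MyReducer extends Reducer<Text, Text, Text, Text> {
        @Override
        protected void reduce(Text key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            //reduce logic goes here: aggregate values, emit the result
        }
    }

    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "Skeleton");
        job.setJarByClass(Skeleton.class);
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
```

And here is the finished WordCount program, with the flesh filled in: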
```java
package org.hadoop.MapReduce;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * My own WordCount class, written from the standard MapReduce boilerplate
 * @author Troy
 *
 */
public class MyWordCount {
//Map class
/**
 * The input is plain text, so the key is the byte offset of the line
 * and the value is the content of that line.
 * map() is invoked once per line, so value always holds a single line.
 *
 */
static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable>{
//Represents a single occurrence of a word
private final static IntWritable one = new IntWritable(1);
//word is declared as a field so that one Text object is reused across all
//map() calls, instead of allocating a new Text for every input line;
//this cuts down on object creation and GC pressure
private Text word = new Text();
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
//Get the current line as a String
String line = value.toString();
//Split the line into tokens
StringTokenizer stringTokenizer = new StringTokenizer(line);
//Loop over the tokens and emit a count for each
while(stringTokenizer.hasMoreTokens()){
String wordValue = stringTokenizer.nextToken();
word.set(wordValue);
//Write out the intermediate (word, 1) pair
context.write(word, one);
}
}
}
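//To make the data flow concrete (my own worked example, not output from a
//real run): for an input line "hello world hello", map() is called once
//and emits (hello, 1), (world, 1), (hello, 1). The framework then shuffles
//and groups the pairs by key, so the reducer sees (hello, [1, 1]) and
//(world, [1]).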
//Reduce class
/**
 * The reduce function sums up all the values that share the same key.
 * Since every value here is 1, the sum is the total count for that word.
 * reduce() is invoked once per distinct key.
 * @author Troy
 *
 */
static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable>{
//Same idea as the word field in the mapper: reuse a single IntWritable
//across all reduce() calls rather than allocating a new one per key
private IntWritable count = new IntWritable();
@Override
protected void reduce(Text key, Iterable<IntWritable> values,Context context)
throws IOException, InterruptedException {
int sum = 0;
//Add up all the values for this key
for(IntWritable value:values){
sum += value.get();
}
//Write out the final (word, total) pair
count.set(sum);
context.write(key, count);
}
}
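//Continuing the worked example: reduce() runs once per distinct key, so it
//is called with (hello, [1, 1]) and writes (hello, 2), then with
//(world, [1]) and writes (world, 1).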
//Driver (client)
public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException{
//Load the configuration
Configuration configuration = new Configuration();
//Create the job; on Hadoop 2.x, Job.getInstance replaces the deprecated new Job(...) constructor
Job job = Job.getInstance(configuration, "MyWordCount");
//1. Set the driver class, which Hadoop uses to locate the jar to ship
job.setJarByClass(MyWordCount.class);
//2. Set the Mapper and Reducer classes
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
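//Optional tweak (not part of my original program): a combiner pre-aggregates
//map output on each node before the shuffle, cutting network traffic.
//Because summing is associative, the Reducer can double as the combiner:
//job.setCombinerClass(MyReducer.class);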
//3. Set the input and output paths
FileInputFormat.addInputPath(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
//4. Set the key/value types of the map output and of the final output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//5. Submit the job, wait for it to finish, and print progress on the client
boolean isSuccess = job.waitForCompletion(true);
//Exit with 0 on success, 1 on failure
System.exit(isSuccess?0:1);
}
}
```
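To run the job, package the class into a jar and submit it with the hadoop command. The jar name and HDFS paths here are just examples from my own setup:

```
hadoop jar MyWordCount.jar org.hadoop.MapReduce.MyWordCount /input/words.txt /output/wordcount
```

Note that the output directory must not exist yet, or the job fails at startup. The result lands in a file such as part-r-00000 inside the output directory, with one tab-separated "word count" pair per line.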