// Below: a WordCount implementation using a different approach (global counters).
package mapreduce.wc;
/**
 * Description:
 * Implements WordCount using a different approach —
 * a global counter.
 */
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import model.ModelMR;
public class WCMR extends Configured implements Tool {

    /**
     * Global counters, aggregated by the MapReduce framework across every map task.
     * Effectively a job-wide shared accumulator.
     */
    enum Words {
        WORD_COUNT, LINE_COUNT
    }

    /**
     * Configures and submits the job. All job initialization lives in run().
     *
     * @param args args[0] = HDFS input path, args[1] = HDFS output path
     * @return 0 on success, 1 on job failure, 2 on bad usage
     */
    @Override
    public int run(String[] args) throws Exception {
        // Fail fast with a usage message instead of an opaque
        // ArrayIndexOutOfBoundsException when paths are missing.
        if (args.length < 2) {
            System.err.println("Usage: WCMR <input path> <output path>");
            return 2;
        }

        // Use the configuration injected by ToolRunner (via Configured) rather
        // than creating a fresh one, so generic -D options are honored.
        Configuration conf = getConf();
        conf.set("fs.defaultFS", "hdfs://hadoop01:9000");
        System.setProperty("HADOOP_USER_NAME", "hadoop");

        Job job = Job.getInstance(conf);
        job.setJarByClass(WCMR.class);
        job.setMapperClass(ModelMRMapper.class);
        // Map-only job: the framework aggregates counters itself, no reducer needed.
        job.setNumReduceTasks(0);
        job.setMapOutputKeyClass(NullWritable.class);
        job.setMapOutputValueClass(NullWritable.class);

        // Input / output paths; delete a pre-existing output directory,
        // otherwise job submission would fail.
        Path inputPath = new Path(args[0]);
        Path outputPath = new Path(args[1]);
        FileInputFormat.setInputPaths(job, inputPath);
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }
        FileOutputFormat.setOutputPath(job, outputPath);

        // Submit and block until completion (verbose progress reporting).
        boolean isDone = job.waitForCompletion(true);
        return isDone ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // ToolRunner parses generic Hadoop options, populates getConf(),
        // then invokes run(args) on the given Tool instance.
        int exitCode = ToolRunner.run(new WCMR(), args);
        System.exit(exitCode);
    }

    /**
     * Map-only mapper that counts words and lines through global counters.
     * Emits no output records (NullWritable/NullWritable, zero reducers).
     */
    private static class ModelMRMapper
            extends Mapper<LongWritable, Text, NullWritable, NullWritable> {

        /**
         * Handles to the job-wide counters — shared "global variables"
         * aggregated across every map task of this MapReduce job.
         */
        private Counter wordsCounter;
        private Counter linesCounter;

        /**
         * Per-task line tally. Folded into the global counter exactly once in
         * cleanup() instead of incrementing the counter on every input line.
         */
        private long countLine = 0;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            System.out.println("***********setup******************");
            wordsCounter = context.getCounter(Words.WORD_COUNT);
            linesCounter = context.getCounter(Words.LINE_COUNT);
            // The context (a MapContextImpl) carries this map task's metadata,
            // including its input split; log which file this task is reading.
            FileSplit fileSplit = (FileSplit) context.getInputSplit();
            Path path = fileSplit.getPath();
            System.out.println("@@@@@@@@@@@@@ " + path);
        }

        @Override
        protected void map(LongWritable key, Text value,
                Mapper<LongWritable, Text, NullWritable, NullWritable>.Context context)
                throws IOException, InterruptedException {
            countLine++;
            // Split on runs of whitespace after trimming, and skip blank lines:
            // the old split(" ") counted an empty line as one "word" and
            // produced empty tokens for consecutive spaces.
            String line = value.toString().trim();
            if (!line.isEmpty()) {
                // increment(n) behaves like: wordsCounter += n
                wordsCounter.increment(line.split("\\s+").length);
            }
        }

        @Override
        protected void cleanup(
                Mapper<LongWritable, Text, NullWritable, NullWritable>.Context context)
                throws IOException, InterruptedException {
            System.out.println("---------------cleanup--------------");
            // Add this task's line total to the global counter once; with N map
            // tasks, linesCounter receives N increments, and the aggregated
            // counter value is the final job-wide line count.
            linesCounter.increment(countLine);
        }
    }
}