步骤
1)创建 WordCount 类,继承 Configured,实现 Tool 接口
2)实现 Mapper 内部类
3)实现 Reducer 内部类
4)设置 Job 相关信息
5)提交 Job 运行
代码实现
package com.hainiuxy;
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Counts the occurrences of each word in text files
 * (the classic Hadoop MapReduce word-count job).
 */
public class WordCount extends Configured implements Tool {

    /*
     * Mapper input types are dictated by the InputFormat:
     * TextInputFormat extends FileInputFormat<LongWritable, Text>, and its
     * createRecordReader() returns a LineRecordReader, which is a
     * RecordReader<LongWritable, Text>.
     *
     * For text input:
     *   KEYIN:   LongWritable — byte offset of the line within the file
     *   VALUEIN: Text         — the line contents
     *
     * Example:
     *   one world
     *   one dream
     *   keyin  valuein
     *   0      one world
     *   10     one dream
     *
     * The output types below are determined by the business logic:
     *   KEYOUT:   Text         — the word
     *   VALUEOUT: LongWritable — the count
     */
    public static class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
        // Reused output key: the current word (avoids per-record allocation).
        Text keyOut = new Text();
        // Reused output value: always 1 for each word occurrence.
        LongWritable valueOut = new LongWritable(1L);

        /**
         * Called once per input line; emits {@code <word, 1>} for every word
         * on the line.
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            System.out.println("------------------------------");
            // e.g. "one world"
            String line = value.toString();
            System.out.println("map input:" + key.get() + ", " + line);
            // e.g. [one, world]
            String[] splits = line.split(" ");
            for (String word : splits) {
                // split(" ") yields empty tokens for consecutive/leading
                // spaces — skip them so we never count an "empty word".
                if (word.isEmpty()) {
                    continue;
                }
                keyOut.set(word);
                // Emit <word, 1>
                context.write(keyOut, valueOut);
                System.out.println("map output:" + word + ", " + valueOut.get());
            }
        }
    }

    /*
     * KEYIN/VALUEIN must match whatever the mapper emitted.
     * KEYOUT/VALUEOUT are determined by the business logic:
     *   KEYOUT:   Text         — the word
     *   VALUEOUT: LongWritable — the final count
     */
    public static class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
        // Reused output value: the aggregated count for the current word.
        LongWritable valueOut = new LongWritable();

        /**
         * Called once per distinct key; the input is effectively
         * {@code <word, list-of-counts>}, e.g. {@code <Hello, [1, 1]>}.
         */
        @Override
        protected void reduce(Text key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            System.out.println("-------------------------");
            // key: "one"; values: [1, 1, 1]
            long sum = 0L;
            StringBuilder sb = new StringBuilder();
            // Build a debug line like: reduce input:one, [1,1,1]
            sb.append("reduce input:" + key.toString() + ", [");
            for (LongWritable w : values) {
                sb.append(w.get()).append(",");
                sum += w.get();
            }
            sb.deleteCharAt(sb.length() - 1).append("]");
            System.out.println(sb.toString());
            // BUG FIX: set the reused writable before emitting. The original
            // wrote a fresh LongWritable(sum) while leaving valueOut stale,
            // so the "reduce output" log below printed the wrong value.
            valueOut.set(sum);
            context.write(key, valueOut);
            System.out.println("reduce output:" + key.toString() + ", " + valueOut.get());
        }
    }

    /**
     * Configures and submits the word-count job, deleting any pre-existing
     * output directory first.
     *
     * @param args args[0] = input directory, args[1] = output directory
     * @return 0 on success, 1 on failure
     * @throws Exception if job setup or submission fails
     */
    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        // Create the job with the display name "wordcount".
        Job job = Job.getInstance(conf, "wordcount");
        // The class used to locate the job jar.
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // Default is a single reduce task; only needs setting when != 1.
        job.setNumReduceTasks(2);
        // Mapper output types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(LongWritable.class);
        // Final (reducer) output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // TextInputFormat/TextOutputFormat are the defaults for text data;
        // set explicitly here for clarity.
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // Job input directory.
        FileInputFormat.addInputPath(job, new Path(args[0]));
        // Job output directory.
        Path outputDir = new Path(args[1]);
        FileOutputFormat.setOutputPath(job, outputDir);
        // FIX: resolve the filesystem the output path actually lives on,
        // instead of FileSystem.get(conf), which returns the *default*
        // filesystem and breaks when the output URI targets another one.
        FileSystem fs = outputDir.getFileSystem(conf);
        if (fs.exists(outputDir)) {
            // Recursive delete so a previous run's output does not abort the job.
            fs.delete(outputDir, true);
            System.out.println("output dir【" + outputDir.toString() + "】 is deleted");
        }
        // waitForCompletion blocks until the job finishes; true = verbose progress.
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Example args: /tmp/mr/input /tmp/mr/output
        System.exit(ToolRunner.run(new WordCount(), args));
    }
}