Mapper类
将MapTask传入的文本内容按行读入转换成String
根据划分方式将这一行切分成单个单词
将单词输出为 <单词,1>
1)读入转化为string:abc abc abc
2)切分
abc
abc
abc
3)输出:(abc,1)、(abc,1)、(abc,1)
package com.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
//KEYIN,map阶段输入的key的类型:LongWritable
//VALUE,map阶段输入的value类型:Text
//KEYOUT,map阶段输出的key类型:Text
//VALUEOUT,map阶段输出的value类型:IntWritable
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    // Reusable output key holding the current word (avoids allocating per record).
    private final Text wordKey = new Text();
    // Constant count of 1 for each occurrence; no aggregation here — the reducer sums.
    private final IntWritable one = new IntWritable(1);

    /**
     * Emits (word, 1) for every space-separated token on the input line.
     * E.g. input "abc abc abc" produces (abc,1), (abc,1), (abc,1).
     *
     * @param key     byte offset of the line within the input split
     * @param value   one line of input text
     * @param context sink for the (word, 1) pairs
     */
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Convert the Hadoop Text line into a Java String, e.g. "abc abc".
        String currentLine = value.toString();
        // Tokenize on single spaces (same delimiter as the original tutorial code).
        String[] tokens = currentLine.split(" ");
        for (String token : tokens) {
            wordKey.set(token);
            // Write out one pair per token.
            context.write(wordKey, one);
        }
    }
}
Reducer类
汇总各个key的个数, 输出该key的总次数。
1)汇总 (abc,1)、(abc,1)、(abc,1)
2)输出key总次数 (abc,3)
package com.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
//KEYIN,reduce阶段输入的key的类型:Text
//VALUE,reduce阶段输入的value类型:IntWritable
//KEYOUT,reduce阶段输出的key类型:Text
//VALUEOUT,reduce阶段输出的value类型:IntWritable
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // Reusable output value for the summed count.
    // Made private final for consistency with WordCountMapper's fields.
    private final IntWritable outV = new IntWritable();

    /**
     * Sums all the 1-counts for a single word and emits (word, total).
     * E.g. input (abc,1), (abc,1), (abc,1) produces (abc,3).
     *
     * @param key     the word
     * @param values  all counts emitted by the mappers for this word
     * @param context sink for the (word, total) pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) throws IOException, InterruptedException {
        // Accumulate the counts, e.g. (1, 1, 1) -> 3.
        int sum = 0;
        for (IntWritable value : values) {
            sum += value.get();
        }
        outV.set(sum);
        // Write out the total for this word.
        context.write(key, outV);
    }
}
Driver类
获取配置信息,获取job对象实例
关联Mapper/Reducer业务类
指定Mapper输出数据的kv类型
指定最终输出的数据的kv类型
指定job的输入原始文件所在目录
指定job的输出结果所在目录
指定本程序的jar包所在的本地路径
提交
package com.mapreduce.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountDriver {
    /**
     * Configures and submits the WordCount job:
     * gets a Job instance, wires up the Mapper/Reducer classes and their
     * output key/value types, sets the input/output paths, and waits for
     * the job to complete.
     */
    public static void main(String[] args) throws IOException, InterruptedException, ClassNotFoundException {
        // 1. Get the job instance from the configuration.
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // 2. Set the jar by locating the class (needed when running on a cluster).
        job.setJarByClass(WordCountDriver.class);
        // 3. Associate the mapper and reducer.
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // 4. Set the map-output key/value types.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // 5. Set the final output key/value types.
        // BUG FIX: the original called setOutputKeyClass twice, so the output
        // value class was never set and the key class was overwritten.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // 6. Set the input and output paths (output dir must not already exist).
        FileInputFormat.setInputPaths(job, new Path("D:\\input"));
        FileOutputFormat.setOutputPath(job, new Path("D:\\output\\output4"));
        // 7. Submit the job; verbose=true prints progress to the console.
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
根据源码,将verbose置为true,可以收集更多信息