1. The core processing logic of the Map phase is written in a Mapper.
2. The core processing logic of the Reduce phase is written in a Reducer.
3. The Mapper and Reducer are combined into a Job.
4. The Job is configured and then run.
Prepare the data file.
Requirement: count how many times each word appears in the file.
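As a concrete running example (the file name and contents below are only illustrative), suppose d:/input/wordcount/hello.txt contains two lines of tab-separated words:

hello	world
hello	hadoop

The expected word-count result would then be:

hadoop	1
hello	2
world	1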
1. Create the WordCountMapper class, extend Mapper, and override the map method.
The generic parameters of Mapper<LongWritable, Text, Text, IntWritable> come in pairs: the input key-value types, then the output key-value types.
Input: LongWritable (byte offset of the line), Text (one line of data).
Output: Text (the key, a word), IntWritable (the value, a count).
import lombok.SneakyThrows;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * WordCountMapper
 *
 * @author
 * @description:
 * @time: 2021/3/24 15:07
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * Output key (the word)
     */
    private Text out_key = new Text();
    /**
     * Output value: each occurrence of a word counts as 1
     */
    private IntWritable out_value = new IntWritable(1);

    @SneakyThrows
    @Override
    protected void map(LongWritable key, Text value, Mapper<LongWritable, Text, Text, IntWritable>.Context context) {
        /*
         * value is one line of data converted to a string,
         * e.g. "hello  world  text" for the first line.
         * Split the line on the tab character "\t".
         */
        String[] words = value.toString().split("\t");
        for (String word : words) {
            out_key.set(word);
            // Write out the pair (word, 1)
            context.write(out_key, out_value);
        }
    }
}
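For the sample input shown earlier, this Mapper emits one (word, 1) pair per token: the line "hello\tworld" produces (hello, 1) and (world, 1), and "hello\thadoop" produces (hello, 1) and (hadoop, 1). Lombok's @SneakyThrows is used here only to avoid declaring the checked exceptions; without Lombok, the map method could equally declare throws IOException, InterruptedException.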
2. Create the WordCountReducer class, extend Reducer, and override the reduce method.
import lombok.SneakyThrows;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * WordCountReducer
 *
 * @author
 * @description:
 * @time: 2021/3/24 15:10
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * Output value (the total count for a word)
     */
    private IntWritable out_value = new IntWritable();

    /**
     * reduce processes one group of data at a time; records with the same key form one group.
     *
     * @param key     the word
     * @param values  all counts for this key; behaves like a collection
     * @param context
     */
    @SneakyThrows
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Reducer<Text, IntWritable, Text, IntWritable>.Context context) {
        int sum = 0;
        // Iterate over values and accumulate the count
        for (IntWritable intWritable : values) {
            sum += intWritable.get();
        }
        out_value.set(sum);
        // Write out the accumulated count
        context.write(key, out_value);
    }
}
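After the map output is shuffled and grouped by key, reduce is called once per word. For the sample data above it would receive ("hello", [1, 1]) and write (hello, 2), then receive ("hadoop", [1]) and ("world", [1]) and write a count of 1 for each.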
3. Create the Job: WordCountDriver.
import com.atguigu.demomptest.service.mapper.WordCountMapper;
import com.atguigu.demomptest.service.reducer.WordCountReducer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * ① The core processing logic of the Map phase is written in a Mapper.
 * ② The core processing logic of the Reduce phase is written in a Reducer.
 * ③ The Mapper and Reducer are combined into a Job.
 * ④ The Job is configured and then run.
 *
 * @author
 * @description:
 * @time: 2021/3/24 15:22
 */
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Path inputPath = new Path("d:/input/wordcount");
        // The output directory must not already exist
        Path outputPath = new Path("d:/output/wordcount");

        // Configuration for the whole Job;
        // the no-arg constructor defaults to the local file system
        Configuration conf = new Configuration();

        // Make sure the output directory does not exist
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(outputPath)) {
            fs.delete(outputPath, true);
        }

        // ① Create the Job
        Job job = Job.getInstance(conf);
        // Use the jar containing this class as the job's jar
        job.setJarByClass(WordCountDriver.class);

        // ② Configure the Job
        // Set the Mapper and Reducer classes the Job runs
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);

        // The Job prepares serializers based on the key-value types emitted by the Mapper and Reducer,
        // and uses them to serialize and deserialize those key-value pairs.
        // If the Mapper and Reducer output the same key-value types, setting the Job's final output types is enough.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Set the input and output directories
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);

        // ③ Run the Job
        job.waitForCompletion(true);
    }
}
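If the job completes successfully, the counts are written to d:/output/wordcount/part-r-00000; for the sample input sketched earlier the file would contain hadoop 1, hello 2 and world 1 (tab-separated, sorted by key, since there is a single reducer by default). As an optional optimization for word counting, the Reducer can also be registered as a combiner so that partial sums are computed on the map side before the shuffle; this would be a one-line addition in the driver before waitForCompletion:

    job.setCombinerClass(WordCountReducer.class);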