MapReduce development in Java follows a fairly fixed pattern with three parts: the Mapper, the Reducer, and the driver (main entry point) configuration. Taking the classic word-count example, the steps are as follows.
1 Mapper code:
package com.hadoop.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * Created by 86186 on 2019/8/31.
 * Generic parameters of Mapper<LongWritable, Text, Text, IntWritable>:
 * KEYIN   (LongWritable): input key - the byte offset of each line's starting position in the file
 * VALUEIN (Text):         input value - the contents of one line
 * KEYOUT  (Text):         output key - a single word
 * VALUEOUT (IntWritable): output value - the count 1 emitted for each occurrence
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * @param key     input key: the byte offset of the current line
     * @param value   input value: the input is split into records, so value can be understood as the contents of one line
     * @param context context used to emit the (word, 1) output pairs
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Split the line into words on runs of non-letter characters ("\\PL+" matches one or more non-letters)
        String[] words = value.toString().split("\\PL+");
        for (String word : words) {
            if (word.isEmpty()) {
                continue; // a line starting with a non-letter produces a leading empty token; skip it
            }
            word = word.toLowerCase(); // normalize case so "Hello" and "hello" count together
            context.write(new Text(word), new IntWritable(1)); // emit (word, 1)
        }
    }
}
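The split pattern "\\PL+" breaks each line on runs of non-letter characters. A quick standalone check makes its behavior concrete, including the leading empty token that the isEmpty() guard above filters out (the class name SplitCheck and the sample strings are my own, made up for illustration):
import java.util.Arrays;

public class SplitCheck {
    public static void main(String[] args) {
        // "\\PL+" matches one or more non-letter characters, so only letter runs survive the split.
        System.out.println(Arrays.toString("Hello, world! hello... MapReduce?".split("\\PL+")));
        // prints: [Hello, world, hello, MapReduce]

        // A line that starts with a non-letter yields a leading empty token:
        System.out.println(Arrays.toString("  leading spaces".split("\\PL+")));
        // prints: [, leading, spaces]
    }
}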
2 Reducer code:
package com.hadoop.mr.wc;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/**
 * Created by 86186 on 2019/8/31.
 * Generic parameters of Reducer<Text, IntWritable, Text, IntWritable>:
 * KEYIN   (Text):         input key - matches the Mapper's output key (the word)
 * VALUEIN (IntWritable):  input value - matches the Mapper's output value
 * KEYOUT  (Text):         output key - the word
 * VALUEOUT (IntWritable): output value - the total count for the word
 */
public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    /**
     * @param key     input key, matching the Mapper's output key
     * @param values  input values for this key, already grouped together by the shuffle phase
     * @param context context used to emit the final (word, total) pairs
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        // The shuffle phase has already grouped the values by key, so a simple sum is enough.
        int count = 0;
        for (IntWritable value : values) {
            count += value.get();
        }
        System.out.println("key:" + key.toString()); // debug output; safe to remove
        context.write(key, new IntWritable(count));
    }
}
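To make the grouping concrete: if the Mapper emits (hello, 1) three times and (world, 1) once, the shuffle phase delivers two calls to reduce, one with key "hello" and values [1, 1, 1] and one with key "world" and values [1], so the job writes (hello, 3) and (world, 1).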
3 Driver (main entry point) configuration
package com.hadoop.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.net.URI;
/**
* Created by 86186 on 2019/8/31.
*/
public class WordCountApp {
    public static void main(String[] args) throws Exception {
        System.setProperty("HADOOP_USER_NAME", "hadoop"); // HDFS user to run as
        Configuration configuration = new Configuration();
        configuration.set("fs.defaultFS", "hdfs://192.168.0.120:8020"); // NameNode URI
        // Create a Job
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCountApp.class);          // class used to locate the job jar
        job.setMapperClass(WordCountMapper.class);      // Mapper class
        job.setReducerClass(WordCountReducer.class);    // Reducer class
        job.setMapOutputKeyClass(Text.class);           // Mapper output key type
        job.setMapOutputValueClass(IntWritable.class);  // Mapper output value type
        job.setOutputKeyClass(Text.class);              // Reducer output key type
        job.setOutputValueClass(IntWritable.class);     // Reducer output value type
        /*
         * If the output path already exists, delete it first;
         * otherwise the job fails with an "output directory already exists" error.
         */
        FileSystem fileSystem = FileSystem.get(new URI("hdfs://192.168.0.120:8020"), configuration, "hadoop");
        Path outputPath = new Path("/wordcount/output");
        if (fileSystem.exists(outputPath)) {
            fileSystem.delete(outputPath, true);
        }
        FileInputFormat.setInputPaths(job, new Path("/wordcount/input/h.txt")); // input file
        FileOutputFormat.setOutputPath(job, outputPath);                        // output directory
        boolean result = job.waitForCompletion(true); // submit the job and wait for it to finish
        System.exit(result ? 0 : -1);
    }
}
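Hardcoding the paths and the NameNode address is fine for a demo, but a common Hadoop idiom is to take them from the command line via Tool/ToolRunner, which also handles generic options such as -D properties. A minimal sketch of that variant (the class name WordCountTool and the args[0] = input, args[1] = output convention are my own assumptions):
package com.hadoop.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class WordCountTool extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // getConf() already contains any -D options parsed by ToolRunner
        Job job = Job.getInstance(getConf(), "word count");
        job.setJarByClass(WordCountTool.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);      // map and reduce output types match,
        job.setOutputValueClass(IntWritable.class); // so setting the job output types is enough
        FileInputFormat.setInputPaths(job, new Path(args[0]));  // input path from the command line
        FileOutputFormat.setOutputPath(job, new Path(args[1])); // output path from the command line
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        System.exit(ToolRunner.run(new Configuration(), new WordCountTool(), args));
    }
}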
3.1 Inspect the contents of the input file h.txt:
3.2 Run the program and check the results:
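With the default TextOutputFormat and the default single reducer, the results end up in /wordcount/output/part-r-00000 as tab-separated word/count lines, which can be viewed with hadoop fs -cat /wordcount/output/part-r-00000.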
4 If you only want to test the Mapper/Reducer logic, you can also run the job in local mode without configuring an HDFS environment; only the driver code needs to change, as shown below:
package com.hadoop.mr.wc;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
* Created by 86186 on 2019/8/31.
*/
public class WordCountAppLocal {
    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        // Create a Job; with no fs.defaultFS set, the local filesystem and local job runner are used
        Job job = Job.getInstance(configuration);
        job.setJarByClass(WordCountAppLocal.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("D:\\F\\hadoop\\mr\\input\\a.txt")); // local input file
        FileOutputFormat.setOutputPath(job, new Path("D:\\F\\hadoop\\mr\\out5"));        // local output directory
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : -1);
    }
}
}
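One caveat when running in local mode on Windows (as the D:\ paths above suggest): Hadoop typically needs winutils.exe available and HADOOP_HOME pointing at its directory, otherwise the job can fail before it even starts.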
4.1 Input file contents:
4.2 Execution results:
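The output layout is the same as in the HDFS run: a part-r-00000 file with one tab-separated word/count pair per line, now under the local directory D:\F\hadoop\mr\out5.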
5 Summary: the basic workflow
The pattern is the same for most MapReduce jobs: the Mapper turns each input record into intermediate (key, value) pairs, here (word, 1); the framework's shuffle phase groups all values by key; the Reducer aggregates each group, here by summing the counts; and the driver wires the classes together, sets the input/output types and paths, and submits the job.