Diagram
The combiner (规约) step exists mainly to reduce the load on the network-transfer (shuffle) phase between map and reduce.
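The original diagram is not reproduced here; as a rough sketch of the idea, using the WordCount job below:

map output (per map task):   hello 1, hello 1, hello 1, hadoop 1
        | combiner runs locally on the map side
combined map output:         hello 3, hadoop 1
        | shuffle: far fewer records cross the network
reduce input:                hello <3, ...>, hadoop <1, ...>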
Preparation
Prepare a wordcount.txt. Ideally the data should contain plenty of repeated words, so the effect of the combiner is easy to see.
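For example, a file along these lines works well (a made-up sample; the mapper below splits each line on commas, so keep the words comma-separated):

hello,world
hello,hadoop
hello,world
hadoop,hello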
Before combining
After combining, the reduce input is noticeably smaller
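The effect is easiest to see in the job's framework counters. The numbers below are illustrative only (made up for a heavily repetitive input); without the combiner, Reduce input records would equal Map output records:

Map output records=10000
Combine input records=10000
Combine output records=120
Reduce input records=120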
Java code
WordCountMapper.java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/*
Mapper generics:
  keyin:    type of k1 — the line offset (LongWritable)
  valuein:  type of v1 — one line of text (Text)
  keyout:   type of k2 — a single word (Text)
  valueout: type of v2 — the fixed count 1 (LongWritable)
*/
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    /*
    The map method converts k1,v1 into k2,v2:
      key:     k1
      value:   v1
      context: the MapReduce context object
    */
    /*
    k1    v1
    0     hello,world
    11    hello,hadoop
    ---------------------------------------------------------
    k2      v2
    hello   1
    world   1
    hello   1
    hadoop  1
    */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        Text text = new Text();
        // Split the line of text into words on commas
        String line = value.toString();
        String[] split = line.split(",");
        // Iterate over the array, emitting each word with a count of 1
        for (String word : split) { // (word, 1)
            text.set(word);
            context.write(text, new LongWritable(1));
        }
    }
}
WordCountReducer.java
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
/*
Reducer generics:
  keyin:    k2 (Text) — a single word
  valuein:  v2 (LongWritable) — the element type of the value collection, e.g. <1,1>
  keyout:   k3 (Text) — a single word
  valueout: v3 (LongWritable) — the number of times the word occurs
*/
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /*
    The reduce method converts k2,v2 into k3,v3:
      key:     k2
      values:  the collection of v2 values for this key
      context: the context object
    */
    /*
    merged k2   v2
    hello       <1,1>
    world       <1,1,1>
    hadoop      <1,1>
    -------------------------------------------
    k3      v3
    hello   2
    world   3
    hadoop  2
    */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0; // accumulates the values
        // 1. Iterate over the values collection
        for (LongWritable value : values) {
            // 2. Add up the values in the collection
            count += value.get();
        }
        // 3. Write k3 and v3 to the context
        context.write(key, new LongWritable(count));
    }
}
MyCombiner.java — this is really just the reducer logic run early, inside each map task, to reduce the network-transfer pressure.
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
public class MyCombiner extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context) throws IOException, InterruptedException {
        long count = 0; // accumulates the values
        // 1. Iterate over the values collection
        for (LongWritable value : values) {
            // 2. Add up the values in the collection
            count += value.get();
        }
        // 3. Write the partially combined (k2, v2) pair back out;
        //    a combiner's output types must match the map output types
        context.write(key, new LongWritable(count));
    }
}
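Since MyCombiner duplicates WordCountReducer line for line, an equivalent shortcut is to reuse the reducer class directly as the combiner. This works whenever the reduce operation is commutative and associative, as summation is:

job.setCombinerClass(WordCountReducer.class);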
JobMain.java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class JobMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        // Create a job object; the job name can be anything you like
        Job job = Job.getInstance(super.getConf(), "wordcount"); // super.getConf() returns the Configuration
        // Required when the job is packaged as a jar and run on the cluster
        job.setJarByClass(JobMain.class);
        // Step 1: set the input format class, which determines k1 and v1
        job.setInputFormatClass(TextInputFormat.class); // how to read
        TextInputFormat.addInputPath(job, new Path("hdfs://master:9000/wordcount")); // where to read from
        // Step 2: set the mapper class
        job.setMapperClass(WordCountMapper.class); // the mapper class written above
        // Set the map-phase output types, matching k2 and v2
        job.setMapOutputKeyClass(Text.class);           // k2
        job.setMapOutputValueClass(LongWritable.class); // v2
        // Steps 3-6 use the defaults (partitioning, sorting, combining, grouping)
        // Set our combiner class
        job.setCombinerClass(MyCombiner.class);
        // Step 7: set the reducer class
        job.setReducerClass(WordCountReducer.class); // the reducer class written above
        // Set the reduce-phase output types, matching k3 and v3
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(LongWritable.class);
        // Set the number of reduce tasks
        job.setNumReduceTasks(2);
        // Step 8: set the output format class
        job.setOutputFormatClass(TextOutputFormat.class);
        // Set the output path
        TextOutputFormat.setOutputPath(job, new Path("hdfs://master:9000/out"));
        // Return 0 on success, 1 otherwise
        boolean b = job.waitForCompletion(true);
        return b ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration(); // loads the Hadoop/HDFS configuration
        int run = ToolRunner.run(configuration, new JobMain(), args);
        System.exit(run);
    }
}
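To run the job, package the project into a jar and submit it with the hadoop command. A sketch, assuming the jar is named wordcount.jar and JobMain sits in the default package (adjust the jar name and the fully qualified class name to your project):

hadoop jar wordcount.jar JobMain

Because two reduce tasks were configured, the result is split across two output files, which can be inspected with:

hdfs dfs -cat /out/part-r-00000 /out/part-r-00001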