Hadoop Study Notes (3): MapReduce Job
MapReduce is a distributed computing framework. What a job actually computes is up to you: you define it by overriding the map and reduce methods in your own Mapper and Reducer subclasses.
Map
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class WordcountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    // Input: (byte offset of the line, the line itself).
    // Output: (word, 1) for every word in the line.
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
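For the input line "good good tool", for example, this map method emits (good,1)(good,1)(tool,1).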
Reduce
If the reduce task reads map output like this:
(good,1)(good,1)(good,1)(good,1)
then by the time it is passed to the reduce method it has become:
key: good
values: (1,1,1,1)
(pairs with the same key are grouped together)
import java.io.IOException;
import java.util.Iterator;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordcountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    // Receives one word and all the 1s collected for it, and sums them.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int count = 0;
        Iterator<IntWritable> iterator = values.iterator();
        while (iterator.hasNext()) {
            IntWritable value = iterator.next();
            count += value.get();
        }
        context.write(key, new IntWritable(count));
    }
}
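Continuing the example above, the reducer receives key good with values (1,1,1,1) and writes (good,4) to the output. Since this reduce logic is a plain sum, the same class could in principle also be registered as a combiner (job.setCombinerClass(WordcountReducer.class)) to pre-aggregate on the map side; note that call is not part of the driver below.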
Job
package cn.edu360.mr.wc;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobSubmitterLinuxToYarn {

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://hdp-01:9000"); // set the default file system
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");

        Job job = Job.getInstance(conf);
        job.setJarByClass(JobSubmitterLinuxToYarn.class); // driver class, used to locate the job jar
        job.setMapperClass(WordcountMapper.class);        // set the mapper class
        job.setReducerClass(WordcountReducer.class);      // set the reducer class

        job.setMapOutputKeyClass(Text.class);             // key/value types of the map output
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);                // key/value types of the reduce output
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path("/wordcount/input"));   // path of the files to process
        FileOutputFormat.setOutputPath(job, new Path("/wordcount/output")); // path for the results (must not already exist)
        job.setNumReduceTasks(3); // 3 reduce tasks -> 3 result files

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
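To submit this from a Linux node to YARN, a minimal sketch (assuming the three classes are packaged into a jar named wc.jar, and the node carries the cluster's Hadoop configuration, including mapreduce.framework.name=yarn):

hadoop jar wc.jar cn.edu360.mr.wc.JobSubmitterLinuxToYarn

After the job finishes, the word counts appear under /wordcount/output as part-r-00000 through part-r-00002, one file per reduce task.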