Main components of a MapReduce program
InputFormat class: splits the input into multiple splits and defines how each line is parsed into a record.
Mapper class: produces intermediate results for each input <key, value> pair.
Combiner class: optionally merges values that share the same key on the map side, before the shuffle.
Partitioner class: during the shuffle, partitions the intermediate results by key into R parts, each of which is handled by one reduce task (a minimal sketch follows this list).
Reducer class: merges the intermediate results from all the maps.
OutputFormat class: controls the format of the final output.
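As an illustrative sketch of the Partitioner component (this class is not part of the example job below, which keeps the default HashPartitioner), a custom Partitioner only needs to map each key to a reducer index in [0, numPartitions):

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical example class, not used by the job below.
public class MyPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        // Mask the sign bit so the index is always in [0, numPartitions);
        // this is essentially what the default HashPartitioner does.
        return (key.hashCode() & Integer.MAX_VALUE) % numPartitions;
    }
}

It would be registered in the driver with job.setPartitionerClass(MyPartitioner.class).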
Execution flow diagram (figure omitted)
Code example
Importing the dependencies
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.4.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.4.0</version>
</dependency>
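As an aside, for Hadoop 2.x the hadoop-client artifact already pulls in hadoop-common and hadoop-hdfs transitively, so the first two dependencies are strictly speaking redundant; they are listed here to make the required modules explicit.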
Mapper class code
package cn.demo.myfriend.data;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * Mapper module.
 *
 * @author zhongyulin
 * LongWritable  the byte offset of the input line
 * Text          the input line itself
 * Text          the output key (the login date)
 * IntWritable   the output value (always 1)
 */
public class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private static final IntWritable one = new IntWritable(1);

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // Split the "name,date" line on the comma
        String[] fields = line.split(",");
        // Emit <date, 1> so the reducer can count logins per date
        context.write(new Text(fields[1]), one);
    }
}
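To sanity-check the Mapper without a cluster, one option is an in-memory MRUnit test. This is only a sketch, and it assumes the org.apache.mrunit:mrunit test dependency (hadoop2 classifier) has been added to the pom:

package cn.demo.myfriend.data;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class MyMapperTest {
    @Test
    public void emitsDateWithCountOne() throws Exception {
        // Feed one input line and assert the expected <date, 1> pair
        MapDriver.newMapDriver(new MyMapper())
                .withInput(new LongWritable(0), new Text("Nehru,2016-01-01"))
                .withOutput(new Text("2016-01-01"), new IntWritable(1))
                .runTest();
    }
}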
Reducer class code
package cn.demo.myfriend.data;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

/**
 * Reducer module: sums the counts emitted for each key.
 *
 * @author zhongyulin
 */
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    private IntWritable result = new IntWritable();

    @Override
    protected void reduce(Text key, Iterable<IntWritable> values,
            Context context) throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        result.set(sum);
        context.write(key, result);
    }
}
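Because this reduce is a simple associative sum, the same class can double as the Combiner listed in the components above: adding job.setCombinerClass(MyReducer.class); to the driver lets each map task pre-aggregate its own <date, 1> pairs locally, shrinking the data moved during the shuffle.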
Program entry point (driver)
package cn.demo.myfriend.data;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {

    public static void main(String[] args)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // Create the job
        Job job = Job.getInstance(conf);
        // Locate the jar that contains this driver (needed when running on a cluster)
        job.setJarByClass(JobMain.class);
        // Specify the Mapper and Reducer classes to run
        job.setMapperClass(MyMapper.class);
        job.setReducerClass(MyReducer.class);
        // Specify the map output and the final job output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Set the input file and the output directory (the latter must not exist yet)
        FileInputFormat.addInputPath(job, new Path("hdfs://172.80.2.207:9000/test/user_login.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://172.80.2.207:9000/test/dataLogin"));
        // Submit the job and block until it finishes; exit non-zero on failure
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
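To launch the job, package the classes into a jar (for example with mvn package) and submit it via hadoop jar; the jar name here is just a placeholder: hadoop jar myfriend-data.jar cn.demo.myfriend.data.JobMain. Note that the job will refuse to start if the output directory /test/dataLogin already exists, so remove it between runs with hdfs dfs -rm -r /test/dataLogin.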
Sample data format
Nehru,2016-01-01
Dane,2016-01-01
Walter,2016-01-01
Gloria,2016-01-01
Clarke,2016-01-01
Madeline,2016-01-01
Kevyn,2016-01-01
Rebecca,2016-01-01
Calista,2016-01-01
Lana,2016-01-01
Phoebe,2016-01-01
Clayton,2016-01-01
Kimberly,2016-01-01
Drew,2016-01-01
Giselle,2016-01-01
Nolan,2016-01-01
Madeson,2016-01-01
Janna,2016-01-01
Raja,2016-01-01
Merrill,2016-01-01
Aquila,2016-01-01
Idona,2016-01-01
Connor,2016-01-01
Jacqueline,2016-01-01
Shaeleigh,2016-01-01
Bevis,2016-01-01
Howard,2016-01-01
Sylvia,2016-01-01
Molly,2016-01-01
Julie,2016-01-01
Hedy,2016-01-01
Kuame,2016-01-01
Kerry,2016-01-01
Burton,2016-01-01
Abra,2016-01-01
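All 35 sample records above share the date 2016-01-01, and the Mapper keys on the date, so the job's output file (part-r-00000 under /test/dataLogin) should contain a single tab-separated line:
2016-01-01	35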