I. Learning Objectives
Understand the design philosophy of MapReduce
Master the basic principles of MapReduce distributed computation
Master MapReduce programming in Java
Master submitting MapReduce jobs to a Hadoop cluster
Reading files from HDFS
Writing files to HDFS
HDFS file formats
II. The Design Philosophy of MapReduce
Divide and conquer: a programming model that simplifies parallel computation
Build an abstract model: developers only need to implement the Mapper and Reducer functions
Hide system-level details: developers can focus on the business logic
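Before touching Hadoop itself, the model can be made concrete with a minimal, framework-free sketch (our illustration in plain Java, not the Hadoop API) of the map/reduce contract for word count; the shuffle that Hadoop performs between the two phases is simulated here by grouping pairs in memory:

import java.util.*;

public class MiniMapReduce {
    // map: (offset, line) -> a list of (word, 1) pairs
    static List<Map.Entry<String, Integer>> map(long offset, String line) {
        List<Map.Entry<String, Integer>> out = new ArrayList<>();
        for (String w : line.split("\\s+")) {
            out.add(new AbstractMap.SimpleEntry<>(w, 1));
        }
        return out;
    }

    // reduce: (word, [1, 1, ...]) -> the word's total count
    static int reduce(String word, List<Integer> counts) {
        int sum = 0;
        for (int c : counts) sum += c;
        return sum;
    }

    public static void main(String[] args) {
        // Simulated shuffle: group the mapped pairs by key
        Map<String, List<Integer>> groups = new TreeMap<>();
        for (Map.Entry<String, Integer> kv : map(0, "hello hadoop hello hadoop java")) {
            groups.computeIfAbsent(kv.getKey(), k -> new ArrayList<>()).add(kv.getValue());
        }
        groups.forEach((w, cs) -> System.out.println(w + "\t" + reduce(w, cs)));
    }
}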
III. Characteristics of MapReduce
Strengths: easy to program, good scalability, high fault tolerance, high throughput
Poor fits: real-time computation (reacting within moments), stream processing, and DAG (directed-graph) computation
IV. MapReduce Programming Conventions
By convention, a MapReduce program has three parts, each written in the subsections below: a Mapper class, a Reducer class, and a driver class (the App) that configures and submits the job.
V. Writing the Code
1. Add the dependencies

<!-- Hadoop dependencies -->
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-client</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-hdfs -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-hdfs</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- MapReduce -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>2.6.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-mapreduce-client-common -->
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-common</artifactId>
    <version>2.6.0</version>
</dependency>
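A note on the versions (our observation): hadoop-client is typically an aggregator that already pulls in hadoop-common, hadoop-hdfs, and the MapReduce client artifacts transitively, so listing all five is belt-and-braces rather than strictly required; what matters is that every entry uses the same version, matching the Hadoop version of the target cluster.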
2. Write the Mapper class
(1) Extend org.apache.hadoop.mapreduce.Mapper
(2) Override the map method to split the input
package com.njbdqn.mywz.wcserver;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/*
 * Mapper<LongWritable, Text, Text, IntWritable>:
 * the first two type parameters are the input key/value types,
 * the last two are the output key/value types.
 * Input and output are both key-value pairs.
 */
public class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);

    /**
     * Split each input line into words.
     * @param key     the byte offset of the line within the file
     * @param value   one line of text
     * @param context used to emit (word, 1) pairs to the framework
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        /*
         * "hello hadoop hello hadoop java" is emitted as:
         * hello  1
         * hadoop 1
         * hello  1
         * hadoop 1
         * java   1
         */
        String[] words = value.toString().split("\\s+");
        // No aggregation here; the shuffle phase of MapReduce groups the pairs
        for (String word : words) {
            context.write(new Text(word), one); // emit one (key, value) record
        }
    }
    /*
     * After the shuffle the reducer will see:
     * hello  ==> [1, 1]
     * hadoop ==> [1, 1]
     * java   ==> [1]
     */
}
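A micro-optimization worth noting (our note, not in the original code): context.write serializes the pair before returning, so the stock Hadoop WordCount example reuses one Text instance instead of allocating a new one per word. Applied to the class above, the field and loop body would become:

private final Text outKey = new Text(); // one reusable output key

// inside map():
for (String word : words) {
    outKey.set(word);
    context.write(outKey, one); // safe: the pair is serialized on write
}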
3. Write the Reducer class
(1) Extend org.apache.hadoop.mapreduce.Reducer
(2) Override the reduce method to aggregate the values
package com.njbdqn.mywz.wcserver;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

/*
 * Reducer<Text, IntWritable, Text, IntWritable>:
 * the first two type parameters are the input key/value types and must match
 * the Mapper's output types; the last two are the output key/value types.
 * Input and output are both key-value pairs.
 */
public class WcReduce extends Reducer<Text, IntWritable, Text, IntWritable> {

    /**
     * Aggregate all the counts for one word.
     * @param key     the word
     * @param values  every 1 emitted for this word by the mappers
     * @param context used to emit the final (word, total) pair
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int num = 0;
        for (IntWritable n : values) {
            num += n.get(); // unwrap the primitive int held by the IntWritable
        }
        context.write(key, new IntWritable(num));
    }
}
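One subtlety worth knowing (our note, not in the original): the framework reuses a single IntWritable instance while iterating over values, so the code above is only safe because it extracts the primitive with n.get() right away. Holding on to the objects themselves is a classic bug; a sketch of the pitfall, inside reduce (java.util.List and ArrayList imports assumed):

// BUGGY sketch: every element ends up referencing the same reused instance,
// so the whole list finally holds only the last value seen.
List<IntWritable> saved = new ArrayList<>();
for (IntWritable n : values) {
    saved.add(n);                           // wrong: stores the reused object
    // saved.add(new IntWritable(n.get())); // right: copy the value out first
}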
4. Write the driver class (App)
Pay close attention to which packages you import! Job, FileInputFormat, and FileOutputFormat must come from the org.apache.hadoop.mapreduce packages, not the legacy org.apache.hadoop.mapred ones.
package com.njbdqn.mywz.wcserver;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class WcApp {
    public static void main(String[] args) throws Exception {
        // Create a job named "wc"; Job must come from the mapreduce package
        Job job = Job.getInstance(new Configuration(), "wc");
        // Make this class the entry point the jar is launched with
        job.setJarByClass(WcApp.class);
        // Where to read the input; FileInputFormat is also the mapreduce one
        FileInputFormat.setInputPaths(job, new Path("d:/English.txt"));
        // Where to write the results; the directory must not exist yet,
        // otherwise the job refuses to start
        FileOutputFormat.setOutputPath(job, new Path("d:/hahaha"));
        // Plug our Mapper into the framework
        job.setMapperClass(WcMapper.class);
        // Tell the framework the Mapper's output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Plug our Reducer into the framework
        job.setReducerClass(WcReduce.class);
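        // Optional tweak, not in the original code: word-count sums are
        // associative and commutative, so the Reducer could also run map-side
        // as a combiner to pre-aggregate output and shrink shuffle traffic:
        // job.setCombinerClass(WcReduce.class);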
        // Tell the framework the job's final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Submit the job and block until it finishes
        job.waitForCompletion(true);
    }
}
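To meet the objective of submitting the job to a Hadoop cluster, the hard-coded d:/ paths have to give way to HDFS paths. A minimal sketch (our suggestion; wc.jar and the /input and /output/wc paths are placeholders): read the paths from the command line,

FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));

then package the project into a jar and launch it with the standard hadoop jar command:

hadoop jar wc.jar com.njbdqn.mywz.wcserver.WcApp /input /output/wc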