1. Writing a MapReduce Program
1.1 pom dependencies
<properties>
    <!-- Hadoop version -->
    <hadoop.version>2.6.4</hadoop.version>
</properties>
<dependencies>
    <!-- Hadoop common components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- Hadoop client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- HDFS -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-hdfs</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN common components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN client components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-client</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN server-side common components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-server-common</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN ResourceManager components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-server-resourcemanager</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN NodeManager components -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-server-nodemanager</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
    <!-- YARN application history service -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-yarn-server-applicationhistoryservice</artifactId>
        <version>${hadoop.version}</version>
    </dependency>
</dependencies>
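Note (a simplification you may or may not want, not required by the example): in Hadoop 2.x the hadoop-client artifact already pulls in hadoop-common, hadoop-hdfs and the MapReduce/YARN client modules transitively, so the list above can usually be trimmed; the explicit entries are kept here for clarity.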
1.2 Writing the Mapper
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Type parameter 1: KEYIN    - the byte offset of the line the framework reads (Long), using the Writable type LongWritable
// Type parameter 2: VALUEIN  - the content of that line, String --> Text
// Type parameter 3: KEYOUT   - the key of the output data, String --> Text
// Type parameter 4: VALUEOUT - the value of the output data, int --> IntWritable
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /**
     * The map task calls the map method once for every line of input.
     */
    @Override
    protected void map(LongWritable key, Text line, Context context) throws IOException, InterruptedException {
        String[] words = line.toString().split(",");
        for (String word : words) {
            // the word is the key, 1 is the value
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
1.3 Writing the Reducer
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    // The reduce task calls reduce once per key, with all the values grouped for that key.
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int sum = 0;
        // add up the 1s emitted by the mappers for this word
        for (IntWritable i : values) {
            sum += i.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
1.4 Writing the Job submission class
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        // create the job object
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // set the mapper and reducer
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        // set the main class so the right jar is shipped with the job
        job.setJarByClass(WordCountJob.class);
        // set the map output types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // set the reduce (final) output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // set the input/output format classes
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // input source and output destination
        TextInputFormat.setInputPaths(job, new Path("file:///E:/data/mapreduce/input"));
        TextOutputFormat.setOutputPath(job, new Path("file:///E:/data/mapreduce/output"));
        // submit the job and wait for it to finish
        job.waitForCompletion(true);
    }
}
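A brief usage note: with the default configuration, running this main method directly from the IDE executes the job in local mode against the file:/// paths above. The output directory must not already exist; otherwise TextOutputFormat's output check fails with a FileAlreadyExistsException.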
2. How MapReduce Works
- A MapReduce job is divided into two phases, map and reduce; the overall flow is as follows (a worked sketch of this data flow is given after the list)
- TextInputFormat, through its RecordReader, splits the input at newline characters: each time a newline is reached one full line has been read, and that line is handed to map
- map phase (Mapper)
- reduce phase (Reducer)
- TextOutputFormat writes the results
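To make the flow concrete, here is a minimal, self-contained sketch in plain Java (no Hadoop classes; the two sample input lines are made up for illustration) that mimics what map, shuffle and reduce do for the wordcount example above:

import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.TreeMap;

public class WordCountFlowDemo {
    public static void main(String[] args) {
        // two sample input lines, as TextInputFormat would hand them to map one by one
        List<String> lines = Arrays.asList("hello,world", "hello,hadoop");

        // "map" phase: every line becomes (word, 1) pairs
        List<Map.Entry<String, Integer>> mapped = new ArrayList<>();
        for (String line : lines) {
            for (String word : line.split(",")) {
                mapped.add(new AbstractMap.SimpleEntry<>(word, 1));
            }
        }

        // "shuffle": group the values by key, sorted by key
        TreeMap<String, List<Integer>> grouped = new TreeMap<>();
        for (Map.Entry<String, Integer> e : mapped) {
            grouped.computeIfAbsent(e.getKey(), k -> new ArrayList<>()).add(e.getValue());
        }

        // "reduce": sum the grouped values for each key
        for (Map.Entry<String, List<Integer>> e : grouped.entrySet()) {
            int sum = 0;
            for (int v : e.getValue()) {
                sum += v;
            }
            System.out.println(e.getKey() + "\t" + sum); // hadoop 1, hello 2, world 1
        }
    }
}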
3. Submitting MapReduce to YARN
3.1 Packaging the job
- Package with IDEA's built-in packaging tool (not recommended here, since we are already using Maven)
- Package with a Maven packaging plugin (recommended)
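For the Maven route (the plugin choice here is an assumption, not something fixed by Hadoop): running mvn package with the maven-jar-plugin or maven-shade-plugin is enough to produce a job jar such as the mapreduce-1.0-SNAPSHOT.jar used in the command below.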
3.2 Submitting the job
- The job can be submitted from any machine that has the Hadoop environment installed (copy the job jar onto that machine)
- Upload the file(s) to be counted to HDFS
- Submit the job
  - Running through hadoop jar makes the jars shipped with Hadoop available on the classpath
  - In the job class, point the default file system at HDFS and use HDFS paths instead of the local file:/// paths used earlier (a sketch of the adjusted class follows below):
    conf.set("fs.defaultFS", "hdfs://ns1");
  - hadoop jar mapreduce-1.0-SNAPSHOT.jar com.uplooking.bigdata.WordCountJob /hdfs/input /hdfs/output
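A minimal sketch of how the job class from section 1.4 might be adjusted for a cluster run, assuming the class lives in the com.uplooking.bigdata package (as in the command above), the HDFS nameservice is hdfs://ns1, and the input/output paths are passed as command-line arguments:

package com.uplooking.bigdata;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountJob {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // point the default file system at the HDFS nameservice
        conf.set("fs.defaultFS", "hdfs://ns1");
        Job job = Job.getInstance(conf);
        job.setJarByClass(WordCountJob.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setInputFormatClass(TextInputFormat.class);
        job.setOutputFormatClass(TextOutputFormat.class);
        // input and output paths come from the command line, e.g. /hdfs/input /hdfs/output
        TextInputFormat.setInputPaths(job, new Path(args[0]));
        TextOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}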
4. Job submission details
From the source code:
- Initialize the Configuration object, which loads the core-default.xml and core-site.xml configuration files
- Obtain the Job object
  - new JobConf() --> load the configuration files --> new Job
- Set the properties
  - set the jar
  - set the mapper
  - set the reducer
  - set the input/output formats
  - set the input path
  - set the output path
- waitForCompletion submits the job
  - check the job state
  - switch to the new API --> set the default number of reduce tasks (1) --> check which API is in use --> finalize the configured properties
- When a job is submitted, large files are divided into splits; by default one split is the size of one block. This can be changed, but changing the split-size rule is generally not recommended (a short sketch of how these defaults can be adjusted is given after this list)
- One MapTask corresponds to one split, so by default one block corresponds to one MapTask
- The number of ReduceTasks defaults to 1
- When a job is submitted, all configuration information is written to a single file (job.xml)
- When a job is submitted, all split information is written to a single file (job.split)
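As a minimal sketch (assuming a FileInputFormat-based input such as TextInputFormat), the defaults mentioned above could be adjusted on the Job before submission; the helper class and values here are hypothetical, purely for illustration:

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class JobTuningSketch {
    // hypothetical helper: apply tweaks to an already configured Job before waitForCompletion
    public static void tune(Job job) {
        // change the number of reduce tasks from the default of 1
        job.setNumReduceTasks(3);
        // cap the split size at 64 MB (splits otherwise default to the block size);
        // possible, but as noted above usually not recommended
        FileInputFormat.setMaxInputSplitSize(job, 64L * 1024 * 1024);
    }
}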
5. Remote debugging of MapReduce
5.1 Add a remote debug configuration in IDEA
5.2 Copy the JVM arguments generated by IDEA and define them as a temporary environment variable
export HADOOP_CLIENT_OPTS="$HADOOP_CLIENT_OPTS -agentlib:jdwp=transport=dt_socket,server=y,suspend=y,address=5005"
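Because of suspend=y, the next hadoop client command run in this shell (for example the hadoop jar command from section 3.2) pauses at JVM startup and waits until the remote debugger in IDEA attaches on port 5005.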