How MapReduce Runs on YARN
(1) The Client submits the application to the YARN master node (ResourceManager, RM)
bin/yarn jar MainClass args
(2) The RM starts a Container on some NodeManager (NM) node to run the ApplicationMaster (AppMaster), the manager of the application
Container: isolates resources (CPU and memory) so that a single Task can use them exclusively
(3) The AppMaster requests resources from the RM in order to run all of the Tasks of the MapReduce job; the RM decides which NMs will provide the resources and tells the AppMaster
(4) The AppMaster contacts those NMs and launches the Tasks (Map Tasks and Reduce Tasks) inside the Containers
(5) Running Tasks report to the AppMaster in real time, so the whole application can be monitored.
(6) When all Tasks (including the Reduce Tasks) have finished, the AppMaster notifies the RM and the AppMaster is shut down
(7) The RM responds to the Client
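For example, step (1) for the WordCount program built later in this section could look like the command below. The jar name follows the default Maven naming for the pom.xml shown at the end (mvn package produces it under target/), and the HDFS input/output paths are made up for illustration:
bin/yarn jar hadoop-1.0-SNAPSHOT.jar com.huadian.bigdata.mapreduce.WordCountMapReduce /user/huadian/wordcount/input /user/huadian/wordcount/output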
MapReduce Programming
The MapReduce data processing flow
Throughout a MapReduce program, all data flows through the stages as key-value pairs (a small end-to-end example is given after the stage descriptions).
Input -> Map -> Shuffle -> Reduce -> Output
(1) For Input and Output, in the normal case no code needs to be written; it is enough to specify the corresponding directories.
(2) The core work to focus on is map and reduce.
Input stage:
Input: read data from HDFS
Output: key            value
        line offset    line content
Mapper stage:
<input key, input value, output key, output value>
<line offset, line content, output key, output value>
What map does: process the data
For example: split on spaces and pull out each word
Output: key     value
        word    1
Shuffle stage:
Functions:
Partitioning: Hadoop's map/reduce supports partitioning on the key, so that the map output is spread evenly across the reducers (a sketch of a custom partitioner follows this list)
Grouping: the values that share the same key are put into one collection
Sorting: keys are sorted in dictionary order
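The partition step can be customized when the default hash partitioning is not suitable. Below is a minimal sketch; it is not part of the WordCount example, and the class name FirstLetterPartitioner is made up for illustration. It sends all words that start with the same character to the same reduce task, and would be registered in the driver with job.setPartitionerClass( FirstLetterPartitioner.class ) together with job.setNumReduceTasks( n ):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

//Hypothetical example: words with the same first character go to the same reduce task
public class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
    @Override
    public int getPartition(Text key, IntWritable value, int numPartitions) {
        if (key.getLength() == 0) {
            return 0;
        }
        //charAt returns the Unicode code point at the given byte offset
        return (key.charAt(0) & Integer.MAX_VALUE) % numPartitions;
    }
}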
Reduce stage:
<word, [1, 1, ...]>  ->  <word, frequency>
Processing: take the values out of the collection and add them up
Output: key     value
        word    word frequency
Output stage:
Input:  key     value
        word    word frequency
Output: write the content to a file on HDFS
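As an illustration (the two input lines and their byte offsets are made up), word count data would move through the stages like this:
Input (key = line offset, value = line content):
    (0,  "hadoop java")
    (12, "java hadoop")
Map output (key = word, value = 1):
    ("hadoop", 1) ("java", 1) ("java", 1) ("hadoop", 1)
Shuffle output (grouped by key, sorted in dictionary order):
    ("hadoop", [1, 1]) ("java", [1, 1])
Reduce output, written to a file on HDFS:
    hadoop  2
    java    2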
Implementation: word frequency count (WordCount) as a Maven project
Main class
package com.huadian.bigdata.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountMapReduce {

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        //(1) Read the configuration files
        Configuration configuration = new Configuration();

        //(2) Create the Job
        //Job getInstance(Configuration conf, String jobName)
        Job job = Job.getInstance( configuration, "WordCountMapReduce" );
        //Set the main class of the Job (used to locate the jar)
        job.setJarByClass( WordCountMapReduce.class );

        //(3) Configure the job
        //(3.1) input
        Path inputPath = new Path( args[0] );
        FileInputFormat.setInputPaths( job, inputPath );

        //(3.2) map
        job.setMapperClass( WordCountMapper.class );
        job.setMapOutputKeyClass( Text.class );
        job.setMapOutputValueClass( IntWritable.class );

        //(3.3) shuffle (nothing extra is configured here; the defaults are used)

        //(3.4) reduce
        job.setReducerClass( WordCountReducer.class );
        job.setOutputKeyClass( Text.class );
        job.setOutputValueClass( IntWritable.class );

        //(3.5) output (the output directory must not already exist on HDFS)
        Path outputPath = new Path( args[1] );
        FileOutputFormat.setOutputPath( job, outputPath );

        //(4) Submit the job and run it
        //print the progress to the user
        boolean isSuccess = job.waitForCompletion( true );
        System.exit( isSuccess ? 0 : 1 );
    }
    /**
     * The Mapper
     * KeyIn: the type of the input key
     *   the byte offset of the line in the file, represented with a Long
     * ValueIn: the type of the input value
     *   the content of one line of the text, represented with a String
     * KeyOut: the type of the output key
     *   a word
     * ValueOut: the type of the output value
     *   the count 1 emitted for every occurrence of a word
     */
    private static class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        private Text mapOutKey = new Text();
        private final static IntWritable mapOutValue = new IntWritable( 1 );

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            //Split the line content into individual words
            String row = value.toString();  //line content, e.g. "hadoop java spring springMvc"
            String[] strs = row.split( " " );
            for (String str : strs) {
                mapOutKey.set( str );
                //Use the context to emit the result of the map method
                context.write( mapOutKey, mapOutValue );
            }
        }
    }
    private static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        private IntWritable outputValue = new IntWritable();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int sum = 0;
            //Add up the values in the collection
            for (IntWritable value : values) {
                sum += value.get();
            }
            outputValue.set( sum );
            context.write( key, outputValue );
        }
    }
}
pom.xml
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.huadian.bigdata</groupId>
    <artifactId>hadoop</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>aliyun</id>
            <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        </repository>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>jboss</id>
            <url>http://repository.jboss.com/nexus/content/groups/public</url>
        </repository>
    </repositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <maven.compiler.source>1.7</maven.compiler.source>
        <maven.compiler.target>1.7</maven.compiler.target>
        <hadoop.version>2.7.3</hadoop.version>
    </properties>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>
    </dependencies>

    <build>
        <pluginManagement><!-- lock down plugins versions to avoid using Maven defaults (may be moved to parent pom) -->
            <plugins>
                <plugin>
                    <artifactId>maven-clean-plugin</artifactId>
                    <version>3.0.0</version>
                </plugin>
                <!-- see http://maven.apache.org/ref/current/maven-core/default-bindings.html#Plugin_bindings_for_jar_packaging -->
                <plugin>
                    <artifactId>maven-resources-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.7.0</version>
                </plugin>
                <plugin>
                    <artifactId>maven-surefire-plugin</artifactId>
                    <version>2.20.1</version>
                </plugin>
                <plugin>
                    <artifactId>maven-jar-plugin</artifactId>
                    <version>3.0.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-install-plugin</artifactId>
                    <version>2.5.2</version>
                </plugin>
                <plugin>
                    <artifactId>maven-deploy-plugin</artifactId>
                    <version>2.8.2</version>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>