一、环境:
集群 CDH 6.2.0/CentOS Linux release 7.5.1804 (Core) /非Kerberos
本地 Win10
二、部署本地开发环境:
从github获取winutils-master包
github地址:https://codeload.github.com/steveloughran/winutils/zip/refs/heads/master
解压 winutils-master.zip,解压后的目录如下(根据自己的实际情况选择一个hadoop版本配置本地环境):
配置本地环境变量:
注意:
HADOOP_USER_NAME 设置为hdfs是因为该用户具有写入hdfs文件系统的权限
HADOOP_HOME和对应的BIN也是必须设置的,本地运行需要hadoop的环境
三、java代码用例(maven项目):
maven版本:apache-maven-3.6.3
jdk版本:jdk-1.8
从集群下载YARN配置,解压并把yarn-conf放置在项目目录下:
maven依赖如下:
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.0.0-cdh6.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.0.0-cdh6.2.0</version>
</dependency>
注:
maven对应CDH包的版本参考:
package com.cdh.pro.guagua.yarn.conf;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
/**
 * Builds the Hadoop/YARN {@link Configuration} used for remote job submission.
 */
public class InitConfiguration {
    /**
     * Loads the cluster client configuration files from {@code confPath} and
     * applies the HDFS client settings needed for submitting from a local machine.
     *
     * @param confPath directory containing core-site.xml, hdfs-site.xml,
     *                 mapred-site.xml and yarn-site.xml (downloaded from the cluster)
     * @return a {@link YarnConfiguration} populated with the cluster settings
     */
    public static Configuration initConf(String confPath) {
        Configuration conf = new YarnConfiguration();
        // Load the four client-side configuration files in the same order
        // the cluster export provides them.
        for (String resource : new String[] {
                "core-site.xml", "hdfs-site.xml", "mapred-site.xml", "yarn-site.xml"}) {
            conf.addResource(new Path(confPath + File.separator + resource));
        }
        conf.setBoolean("dfs.support.append", true);
        conf.set("fs.hdfs.impl", "org.apache.hadoop.hdfs.DistributedFileSystem");
        // Disable FileSystem caching so each call gets a fresh client instance.
        conf.setBoolean("fs.hdfs.impl.disable.cache", true);
        return conf;
    }
}
package com.cdh.pro.guagua.yarn;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * 作业提交 — builds the WordCount MapReduce job for submission to the cluster.
 */
public class InitMapReduceJob {
    /**
     * Creates and configures the WordCount job: cross-platform submission,
     * job jar location, mapper/reducer classes, output types, and the
     * hard-coded HDFS input/output paths {@code /test/input} / {@code /test/output}.
     *
     * <p>Note: on failure the exception is printed and {@code null} is
     * returned; callers must check for a null job before using it.
     *
     * @param conf cluster configuration (see InitConfiguration)
     * @return the configured {@link Job}, or {@code null} if setup failed
     */
    public static Job initWordCountJob(Configuration conf) {
        Job wcjob = null;
        try {
            // Required when submitting from Windows to a Linux cluster.
            conf.setBoolean("mapreduce.app-submission.cross-platform", true);
            // Point the job at the packaged jar so the cluster can locate the
            // mapper/reducer classes. "mapreduce.job.jar" is the current name
            // of the deprecated "mapred.jar" property (Hadoop maps the old
            // key to this one internally).
            conf.set("mapreduce.job.jar",
                    System.getProperty("user.dir") + "/lib/guagua-0.0.1-SNAPSHOT.jar");
            wcjob = Job.getInstance(conf);
            wcjob.setMapperClass(WordCountMapper.class);
            wcjob.setReducerClass(WordCountReducer.class);
            // Mapper output types: <word, 1>.
            wcjob.setMapOutputKeyClass(Text.class);
            wcjob.setMapOutputValueClass(LongWritable.class);
            // Reducer (final) output types: <word, count>.
            wcjob.setOutputKeyClass(Text.class);
            wcjob.setOutputValueClass(LongWritable.class);
            FileInputFormat.setInputPaths(wcjob, "/test/input");
            FileOutputFormat.setOutputPath(wcjob, new Path("/test/output"));
        } catch (Exception e) {
            // Preserve the original best-effort behavior: report and return null.
            e.printStackTrace();
        }
        return wcjob;
    }
}
package com.cdh.pro.guagua.yarn;
import java.io.IOException;
import org.apache.commons.lang3.StringUtils;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
/**
 * WordCount mapper: emits a {@code <word, 1>} pair for every
 * space-separated token of each input line.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    // Reused output objects — Hadoop serializes the contents on write(),
    // so reusing one instance per task avoids allocating two objects per
    // emitted record (standard MapReduce practice).
    private final Text outKey = new Text();
    private final LongWritable one = new LongWritable(1);

    /**
     * @param key   byte offset of the line within the input split (unused)
     * @param value one line of input text
     */
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        // StringUtils.split treats adjacent separators as one, so runs of
        // spaces do not produce empty words.
        String[] words = StringUtils.split(line, " ");
        for (String word : words) {
            outKey.set(word);
            context.write(outKey, one);
        }
    }
}
package com.cdh.pro.guagua.yarn;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/**
 * WordCount reducer: sums all partial counts emitted for each word and
 * writes the {@code <word, total>} pair.
 */
public class WordCountReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    /**
     * @param key    the word
     * @param values all counts emitted by the mappers for this word
     */
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
            throws IOException, InterruptedException {
        long total = 0L;
        // Accumulate every partial count for this word.
        for (LongWritable partial : values) {
            total += partial.get();
        }
        context.write(key, new LongWritable(total));
    }
}
package com.cdh.pro.guagua.yarn;
import java.io.File;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import com.cdh.pro.guagua.yarn.conf.InitConfiguration;
/**
 * Driver that submits the WordCount job to a non-Kerberos CDH cluster
 * and blocks until it completes. Exit status: 0 on success, 1 on failure.
 */
public class NonKerberosMRTest {
    // Local directory holding the yarn-conf files downloaded from the cluster.
    private static String confPath = System.getProperty("user.dir") + File.separator + "yarndemo"
            + File.separator + "yarn-conf";

    public static void main(String[] args) {
        try {
            System.out.println(confPath);
            Configuration conf = InitConfiguration.initConf(confPath);
            Job wcjob = InitMapReduceJob.initWordCountJob(conf);
            // initWordCountJob returns null on failure; guard against NPE.
            if (wcjob == null) {
                System.exit(1);
            }
            wcjob.setJarByClass(NonKerberosMRTest.class);
            wcjob.setJobName("NonKerberosMRTest");
            // Submit and wait; "true" prints job progress to stdout.
            boolean res = wcjob.waitForCompletion(true);
            System.exit(res ? 0 : 1);
        } catch (Exception e) {
            e.printStackTrace();
            // Bug fix: previously the JVM fell through after the catch and
            // exited 0 (success) even when submission failed.
            System.exit(1);
        }
    }
}
CDH集群执行以下操作(建立对应的目录和文件):
[root@virtual-cdh01 app]# vim word.txt
wq dsvr dewfw wq pp qq ww qq pp pp
de de ss aa ss dd
[root@virtual-cdh01 app]# hadoop fs -mkdir /test/input/
[root@virtual-cdh01 app]# hadoop fs -put word.txt /test/input/
打包项目:
mvn clean package
将target目录下的jar包复制到项目的lib目录下:
注意:
打包项目并放置对应的jar包到lib目录是为了CDH集群在运行作业的时候找到对应的class
// mvn clean package 编译获得 guagua-0.0.1-SNAPSHOT.jar 必须设置,否则CDH执行无法找到对应class
conf.set("mapred.jar", System.getProperty("user.dir") + "/lib/guagua-0.0.1-SNAPSHOT.jar");
运行作业:
到CDH集群查看结果:
[root@virtual-cdh01 app]# hadoop fs -ls /test/output/
执行成功!!!