Configuration options for remotely submitting MapReduce v2 jobs to YARN from Java code

Last edited 2021-12-22.

import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.mapreduce.MRConfig;
import org.apache.hadoop.mapreduce.MRJobConfig;
import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
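
The settings below assume an existing `Configuration` named `conf` and a `Job` named `job`. A minimal sketch of how they might be created (the use of `HBaseConfiguration` and the job name are assumptions, since the original driver code is not shown):

```java
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.mapreduce.Job;

// HBaseConfiguration.create() layers hbase-site.xml on top of the core
// Hadoop resources; a plain "new Configuration()" also works here because
// the ZooKeeper settings are applied explicitly below.
Configuration conf = HBaseConfiguration.create();

// ... apply all the conf.set(...) calls shown below ...

// Job.getInstance copies the Configuration, so create the Job only after
// every property has been set on conf.
Job job = Job.getInstance(conf, "remote-submit-example");
```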

	// hbase.zookeeper.quorum
	conf.set(HConstants.ZOOKEEPER_QUORUM, "zkserver01,zkserver02,zkserver03:2181");
	// zookeeper.session.timeout
	conf.set(HConstants.ZK_SESSION_TIMEOUT, "60000");

	// yarn.resourcemanager.scheduler.address
	conf.set(YarnConfiguration.RM_SCHEDULER_ADDRESS, "rmserver:8030");
	// yarn.resourcemanager.resource-tracker.address
	conf.set(YarnConfiguration.RM_RESOURCE_TRACKER_ADDRESS, "rmserver:8031");
	// yarn.resourcemanager.address
	conf.set(YarnConfiguration.RM_ADDRESS, "rmserver:8032"); // see step 3
	// yarn.resourcemanager.admin.address
	conf.set(YarnConfiguration.RM_ADMIN_ADDRESS, "rmserver:8033");
	// yarn.resourcemanager.webapp.address
	conf.set(YarnConfiguration.RM_WEBAPP_ADDRESS, "rmserver:8088");
	// yarn.resourcemanager.webapp.https.address
	conf.set(YarnConfiguration.RM_WEBAPP_HTTPS_ADDRESS, "rmserver:8090");
	// mapreduce.framework.name
	conf.set(MRConfig.FRAMEWORK_NAME, "yarn");
	// fs.defaultFS
	conf.set(CommonConfigurationKeys.FS_DEFAULT_NAME_KEY, "hdfs://hadoopserver:8020"); // see step 2
	// Address of the MapReduce JobHistory Server
	// mapreduce.jobhistory.address
	conf.set(JHAdminConfig.MR_HISTORY_ADDRESS, "rmhisserver:10020");
	// Admin address of the JobHistory Server
	// mapreduce.jobhistory.admin.address
	conf.set(JHAdminConfig.JHS_ADMIN_ADDRESS, "rmhisserver:10033");
	// Web UI address of the MapReduce JobHistory Server
	// mapreduce.jobhistory.webapp.address
	conf.set(JHAdminConfig.MR_HISTORY_WEBAPP_ADDRESS, "rmhisserver:19888");
	// mapreduce.jobhistory.webapp.https.address
	conf.set(JHAdminConfig.MR_HISTORY_WEBAPP_HTTPS_ADDRESS, "rmhisserver:19890");

	// yarn.app.mapreduce.am.staging-dir
	conf.set(MRJobConfig.MR_AM_STAGING_DIR, "/user");
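	// Note: the -Xmx heap sizes below are roughly 80% of the matching
	// container sizes (memory.mb), a common rule of thumb that leaves
	// headroom for non-heap JVM memory.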
	// mapreduce.map.java.opts
	conf.set(MRJobConfig.MAP_JAVA_OPTS, " -Xmx1638m");
	// mapreduce.map.memory.mb
	conf.set(MRJobConfig.MAP_MEMORY_MB, "2048");
	// mapreduce.reduce.java.opts
	conf.set(MRJobConfig.REDUCE_JAVA_OPTS, " -Xmx1638m");
	// mapreduce.reduce.memory.mb
	conf.set(MRJobConfig.REDUCE_MEMORY_MB, "2048");

	// mapreduce.job.reduce.slowstart.completedmaps
	conf.set(MRJobConfig.COMPLETED_MAPS_FOR_REDUCE_SLOWSTART, "0.8");
	// mapreduce.task.io.sort.factor
	conf.set(MRJobConfig.IO_SORT_FACTOR, "64");
	// mapreduce.task.io.sort.mb
	conf.set(MRJobConfig.IO_SORT_MB, "256");
	// mapreduce.map.output.compress
	conf.set(MRJobConfig.MAP_OUTPUT_COMPRESS, "true");
	// mapreduce.map.output.compress.codec
	conf.set(MRJobConfig.MAP_OUTPUT_COMPRESS_CODEC, "org.apache.hadoop.io.compress.SnappyCodec");
	// mapreduce.reduce.shuffle.parallelcopies
	conf.set(MRJobConfig.SHUFFLE_PARALLEL_COPIES, "10");

	// yarn.application.classpath
	conf.set(YarnConfiguration.YARN_APPLICATION_CLASSPATH,
		"$HADOOP_CONF_DIR,$HADOOP_COMMON_HOME/*,$HADOOP_COMMON_HOME/lib/*,"
		+ "$HADOOP_HDFS_HOME/*,$HADOOP_HDFS_HOME/lib/*,$HADOOP_MAPRED_HOME/*,$HADOOP_MAPRED_HOME/lib/*,"
		+ "$HADOOP_YARN_HOME/*,$HADOOP_YARN_HOME/lib/*");
	
	job.setNumReduceTasks(100);
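
With the configuration in place, the rest of the driver is standard boilerplate. A minimal sketch, in which `MyMapper`, `MyReducer`, the jar path, and the input/output paths are placeholders:

```java
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

// When submitting from a machine outside the cluster (e.g. an IDE),
// setJarByClass only works if the classes are already loaded from a jar,
// so point the job at the local jar explicitly (equivalent to setting
// mapreduce.job.jar) so it gets shipped to the cluster.
job.setJar("/path/to/your-job.jar");

job.setMapperClass(MyMapper.class);          // placeholder mapper
job.setReducerClass(MyReducer.class);        // placeholder reducer
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);

FileInputFormat.addInputPath(job, new Path("/user/demo/input"));
FileOutputFormat.setOutputPath(job, new Path("/user/demo/output"));

// Blocks until the job completes; the exit code reflects success or failure.
System.exit(job.waitForCompletion(true) ? 0 : 1);
```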


You can also submit a Hive SQL script to YARN from Java code. The sketch below registers an HQL script that already exists on HDFS as a local resource and launches it with `hive -f` as the application's container command; it assumes the `hive` CLI is installed on the NodeManager hosts and that `path/to/hive_script.hql` is an HDFS path:

```java
import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.yarn.api.records.ApplicationId;
import org.apache.hadoop.yarn.api.records.ApplicationSubmissionContext;
import org.apache.hadoop.yarn.api.records.ContainerLaunchContext;
import org.apache.hadoop.yarn.api.records.LocalResource;
import org.apache.hadoop.yarn.api.records.LocalResourceType;
import org.apache.hadoop.yarn.api.records.LocalResourceVisibility;
import org.apache.hadoop.yarn.api.records.Priority;
import org.apache.hadoop.yarn.api.records.Resource;
import org.apache.hadoop.yarn.client.api.YarnClient;
import org.apache.hadoop.yarn.client.api.YarnClientApplication;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.ConverterUtils;
import org.apache.hadoop.yarn.util.Records;

public class SubmitHiveSqlToYarn {

    public static void main(String[] args) throws Exception {
        // 1. Initialize the YARN configuration
        Configuration conf = new YarnConfiguration();

        // 2. Create and start the YARN client
        YarnClient yarnClient = YarnClient.createYarnClient();
        yarnClient.init(conf);
        yarnClient.start();

        // 3. Create the YARN application and name it
        YarnClientApplication app = yarnClient.createApplication();
        ApplicationSubmissionContext appContext = app.getApplicationSubmissionContext();
        appContext.setApplicationName("Hive SQL on YARN");

        // 4. Register the HQL script (already on HDFS) as a local resource,
        //    so YARN copies it into the container's working directory
        Map<String, LocalResource> localResources = new HashMap<>();
        localResources.put("hive_script.hql",
                createLocalResource(new Path("path/to/hive_script.hql"), conf));

        // 5. Environment for the container
        Map<String, String> env = new HashMap<>();
        env.put("CLASSPATH", "./*");

        // 6. Launch command: run the localized script with hive -f
        String command = "hive -f hive_script.hql";

        // 7. Assemble the AM container spec and its resource request
        ContainerLaunchContext amContainer = ContainerLaunchContext.newInstance(
                localResources, env, Collections.singletonList(command),
                null, null, null);
        appContext.setAMContainerSpec(amContainer);
        appContext.setResource(Resource.newInstance(1024, 1)); // 1 GB, 1 vcore
        appContext.setPriority(Priority.newInstance(0));

        // 8. Submit and report the application id
        ApplicationId appId = appContext.getApplicationId();
        yarnClient.submitApplication(appContext);
        System.out.println("Submitted application " + appId);
    }

    private static LocalResource createLocalResource(Path path, Configuration conf)
            throws IOException {
        FileStatus fileStatus = FileSystem.get(conf).getFileStatus(path);
        LocalResource localResource = Records.newRecord(LocalResource.class);
        localResource.setType(LocalResourceType.FILE);
        localResource.setVisibility(LocalResourceVisibility.APPLICATION);
        localResource.setSize(fileStatus.getLen());
        localResource.setTimestamp(fileStatus.getModificationTime());
        localResource.setResource(ConverterUtils.getYarnUrlFromURI(path.toUri()));
        return localResource;
    }
}
```

The main steps are:

1. Initialize the YARN configuration
2. Create and start a YARN client
3. Create the YARN application and set its name
4. Register the HQL script as a local resource
5. Set the container environment
6. Build the container launch command (`hive -f hive_script.hql`)
7. Assemble the AM ContainerLaunchContext and resource request
8. Submit the application

Replace the table name referenced inside `hive_script.hql` (for example `my_table`) with your own table, and replace `path/to/hive_script.hql` with the HDFS path of your script.
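
After submission, the client can poll the application status. A small follow-on sketch using the `yarnClient` and `appId` from above:

```java
import org.apache.hadoop.yarn.api.records.ApplicationReport;
import org.apache.hadoop.yarn.api.records.YarnApplicationState;

// Poll until the application reaches a terminal state.
ApplicationReport report = yarnClient.getApplicationReport(appId);
YarnApplicationState state = report.getYarnApplicationState();
while (state != YarnApplicationState.FINISHED
        && state != YarnApplicationState.KILLED
        && state != YarnApplicationState.FAILED) {
    Thread.sleep(1000);
    report = yarnClient.getApplicationReport(appId);
    state = report.getYarnApplicationState();
}
System.out.println("Application finished with state " + state
        + ", final status " + report.getFinalApplicationStatus());
```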