- configure Eclipse: add the Scala IDE plugin and the m2e-scala plugin (http://alchim31.free.fr/m2e-scala/update-site/)
- configure Spark to submit code to a remote YARN cluster
// Build a SparkConf that submits this application to a remote YARN cluster
// in client mode (the driver runs on the local machine).
val sparkConf = new SparkConf()
  .setAppName(s"Bulk Import $manualNbr")
  .setMaster("yarn")
  // FIX: "deploy-mode" is not a recognized SparkConf key and was silently
  // ignored; the correct configuration key is "spark.submit.deployMode"
  // (equivalent to spark-submit's --deploy-mode flag).
  .set("spark.submit.deployMode", "client")
  // .setJars(Seq("hdfs://ubuntu1:50071/jarcache/spark-assembly-2.3.0.jar",
  // "hdfs://ubuntu1:50071/jarcache/spark-2.3.0-yarn-shuffle.jar"))
  // Jar(s) shipped to YARN containers so executors can load the app classes.
  .set("spark.yarn.jars", "hdfs://ubuntu1:50071/cache/spark/import-0.0.1-SNAPSHOT.jar")
  .set("spark.executor.memory", "512m")
  // The YARN AM must be able to reach the driver at this address
  // (see the firewall note below).
  .set("spark.driver.host", "192.168.1.105")
  // Placeholder values — replace with the real history server host:port.
  .set("spark.yarn.historyServer.address", "http://spark_history_server:history_port")
// NOTE(review): Spark normally resolves HADOOP_CONF_DIR / YARN_CONF_DIR from
// the process *environment*, not from JVM system properties — setting them
// via System.setProperty may have no effect; verify, or export these
// variables before launching the JVM.
System.setProperty("HADOOP_CONF_DIR", "\\configuration\\src\\main\\resources");
System.setProperty("YARN_CONF_DIR", "\\configuration\\src\\main\\resources");
make sure the Hadoop and YARN configuration files are placed under HADOOP_CONF_DIR and YARN_CONF_DIR
- ensure no firewall blocks traffic to the driver machine, so the YARN ApplicationMaster can reach the driver (symptom: "cannot connect to driver")
- make sure YARN has enough resources configured, so that the AM and all requested executors can be allocated (symptom: "cannot connect to driver")
<!-- yarn-site.xml: NodeManager/ResourceManager settings for a small test
     cluster (RM host "ubuntu1"). Must be present under HADOOP_CONF_DIR /
     YARN_CONF_DIR on the submitting machine. -->
<configuration>
<!-- Earlier memory/vcore tuning attempts, kept for reference but disabled.
     Note: yarn.nodemanager.vmem-pmem-ratio is redefined (active) further
     below with value 3. -->
<!--
<property>
<description>Amount of physical memory, in MB, that can be allocated
for containers.</description>
<name>yarn.nodemanager.resource.memory-mb</name>
<value>4096</value>
</property>
<property>
<description>Ratio between virtual memory to physical memory when
setting memory limits for containers. Container allocations are
expressed in terms of physical memory, and virtual memory usage
is allowed to exceed this allocation by this ratio.
</description>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>4</value>
</property>
<property>
<description>Number of vcores that can be allocated
for containers. This is used by the RM scheduler when allocating
resources for containers. This is not used to limit the number of
physical cores used by YARN containers.</description>
<name>yarn.nodemanager.resource.cpu-vcores</name>
<value>1</value>
</property>-->
<property>
<description>The hostname of the RM.</description>
<name>yarn.resourcemanager.hostname</name>
<value>ubuntu1</value>
</property>
<!-- The two addresses below reuse the hostname above via variable
     substitution; clients (including the remote Spark driver) connect
     to these ports. -->
<property>
<description>The address of the applications manager interface in the RM.</description>
<name>yarn.resourcemanager.address</name>
<value>${yarn.resourcemanager.hostname}:8032</value>
</property>
<property>
<description>The address of the scheduler interface.</description>
<name>yarn.resourcemanager.scheduler.address</name>
<value>${yarn.resourcemanager.hostname}:8030</value>
</property>
<!-- Relaxes the virtual-memory check so containers are not killed for
     exceeding vmem limits (common on small test nodes). -->
<property>
<description>Ratio between virtual memory to physical memory when
setting memory limits for containers. Container allocations are
expressed in terms of physical memory, and virtual memory usage
is allowed to exceed this allocation by this ratio.
</description>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>3</value>
</property>
<!-- Smallest container the RM will hand out; keeps allocations small
     enough for 512m executors to fit. -->
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>512</value>
</property>
<!-- Site specific YARN configuration properties -->
<!-- Auxiliary shuffle service required for MapReduce-style shuffle. -->
<property>
<name>yarn.nodemanager.aux-services</name>
<value>mapreduce_shuffle</value>
</property>
<property>
<name>yarn.nodemanager.aux-services.mapreduce.shuffle.class</name>
<value>org.apache.hadoop.mapred.ShuffleHandler</value>
</property>
<!-- NOTE(review): the paths below contain a doubled slash ("//home//");
     harmless on Linux but verify it is intentional. -->
<property>
<name>yarn.nodemanager.log-dirs</name>
<value>file:///home//bigdata/hadoop-2.7.6/userlog</value>
<final>true</final>
</property>
<property>
<name>yarn.nodemanager.local-dirs</name>
<value>file:///home//bigdata/hadoop-2.7.6/temp/nm-local-dir</value>
</property><property><name>yarn.log.server.url</name><value>...</value></property>
<!-- NOTE(review): yarn.log.server.url above is still the "..." placeholder;
     fill in the log aggregation server URL before use. -->
<!-- Keep finished containers' local dirs for 600 s so they can be
     inspected when debugging (see the debug note at the end of the file). -->
<property>
<name>yarn.nodemanager.delete.debug-delay-sec</name>
<value>600</value>
</property>
<!-- Disabled attempt at pinning the YARN application classpath. -->
<!--<property>
<name>yarn.application.classpath</name>
<value>file:///home//bigdata/hadoop-2.7.6/,file:///home//bigdata/hadoop-2.7.6/share/hadoop/common/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/common/lib/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/hdfs/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/hdfs/lib/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/mapreduce/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/mapreduce/lib/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/yarn/*,file:///home//bigdata/hadoop-2.7.6/share/hadoop/yarn/lib/*</value>
</property>-->
</configuration>
- manage and resolve dependency conflicts with Maven dependency exclusions
- assemble all code into a single uber-jar with the Maven Shade plugin
<!-- Maven Shade plugin: builds the uber-jar during the package phase so the
     single application jar can be uploaded to HDFS (see spark.yarn.jars).
     NOTE(review): no <version> element — the plugin version is unpinned;
     consider pinning it for reproducible builds. -->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-shade-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>shade</goal>
</goals>
<configuration>
<!-- Strip per-dependency cruft from the merged jar. Removing the
     signature files (*.SF/*.DSA/*.RSA) is required: stale signatures
     from signed dependencies would make the uber-jar fail verification. -->
<filters>
<filter>
<artifact>*:*</artifact>
<excludes>
<exclude>META-INF/*.SF</exclude>
<exclude>META-INF/*.DSA</exclude>
<exclude>META-INF/*.RSA</exclude>
<exclude>META-INF/LICENSE</exclude>
<exclude>LICENSE</exclude> <!--if this is same as above, not required -->
<exclude>/*.png</exclude>
<exclude>/*.html</exclude>
<exclude>/*.jpeg</exclude>
</excludes>
</filter>
</filters>
<!-- Keep test-only dependencies out of the runtime jar entirely. -->
<artifactSet>
<excludes>
<exclude>junit:junit</exclude>
</excludes>
</artifactSet>
<!-- Merge identically-named META-INF/services descriptor files instead of
     letting one dependency's copy overwrite another's. -->
<transformers>
<transformer
implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" />
</transformers>
</configuration>
</execution>
</executions>
</plugin>
transformer: the ServicesResourceTransformer merges service descriptor files of the same name under the META-INF/services folder
- to debug, check the application logs in YARN, or go to the NodeManager machine and inspect the container details under <yarn.nodemanager.local-dirs>