Env
Ubuntu 20.04
hadoop-3.1.3
hive-3.1.2-bin
spark-3.3.0-bin-hadoop3
spark-defaults.conf
spark.master yarn
spark.driver.memory 512m
spark.yarn.am.memory 512m
spark.executor.memory 512m
spark-env.sh
export HADOOP_HOME=/path/to/hadoop
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export JAVA_HOME=/path/to/java
export SCALA_HOME=/path/to/scala
Start:
spark-sql --driver-class-path /path/to/hive-3.1.2-bin/lib/mysql-connector-java-5.1.49.jar
- Alternatively, copy the MySQL driver into Spark's jars directory:
cp /path/to/mysql-connector-java-5.1.49.jar $SPARK_HOME/jars
then run spark-sql (no --driver-class-path needed)
Verify:
create tpcds dataset in Hive with: tpcds-for-hive-on-emr
Errors
"Failed to send RPC" error:
Fix: change the YARN configuration (yarn-site.xml) and restart YARN:
<property>
<name>yarn.scheduler.maximum-allocation-mb</name>
<value>9216</value>
</property>
<property>
<name>yarn.scheduler.minimum-allocation-mb</name>
<value>4000</value>
</property>
<property>
<name>yarn.nodemanager.vmem-pmem-ratio</name>
<value>4.1</value>
</property>