Unpack
tar -zxvf spark.tar
echo 'export SPARK_HOME=$HOME/program/spark' >> ~/.bash_profile
echo 'export PATH=$PATH:$SPARK_HOME/bin' >> ~/.bash_profile
source ~/.bash_profile
mv spark $SPARK_HOME
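With SPARK_HOME exported, a plain python3 interpreter can also locate this installation through the third-party findspark package. A minimal sketch, assuming pip install findspark (purely optional; the interactive shells below do not need it):
import findspark
findspark.init()            # reads SPARK_HOME from the environment

import pyspark
print(pyspark.__version__)  # should report the unpacked version, e.g. 2.4.3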
Edit the configuration files
- spark-env.sh
cd $SPARK_HOME/conf
cp spark-env.sh.template spark-env.sh
vi spark-env.sh
export SCALA_HOME=/usr/local/bigdata/scala
export JAVA_HOME=/usr/local/bigdata/java/jdk1.8.0_211
export HADOOP_HOME=/usr/local/bigdata/hadoop-2.7.1
export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
export SPARK_MASTER_HOST=Master   # SPARK_MASTER_IP is deprecated since Spark 2.x
export SPARK_LOCAL_DIRS=/usr/local/bigdata/spark-2.4.3
export SPARK_WORKER_MEMORY=1g
export SPARK_DRIVER_MEMORY=1g
export PYSPARK_PYTHON=/usr/bin/python3
export PYSPARK_DRIVER_PYTHON=/usr/bin/python3
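SPARK_WORKER_MEMORY and SPARK_DRIVER_MEMORY above are cluster-wide defaults; individual applications can override the executor side when the session is built. A minimal sketch, assuming the spark://lx81:7077 master URL used later in this guide:
from pyspark.sql import SparkSession

# per-application override of the memory defaults set in spark-env.sh
spark = (SparkSession.builder
         .master('spark://lx81:7077')
         .appName('conf-demo')
         .config('spark.executor.memory', '1g')  # must fit within SPARK_WORKER_MEMORY
         .getOrCreate())
print(spark.sparkContext.getConf().get('spark.executor.memory'))
spark.stop()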
- slaves
cp slaves.template slaves
vi slaves
slave01
slave02
Copy the Hive configuration
cp $HIVE_HOME/conf/hive-site.xml $SPARK_HOME/conf/
vi $SPARK_HOME/conf/hive-site.xml
<!-- append before the closing tag: lets pyspark reach the Hive metastore -->
<property>
    <name>hive.metastore.uris</name>
    <value>thrift://10.10.10.90:9083</value>
</property>
</configuration>
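If copying hive-site.xml around is inconvenient, the same metastore address can also be supplied per application when the session is built; a sketch using the URI configured above:
from pyspark.sql import SparkSession

# equivalent to the hive.metastore.uris entry in hive-site.xml
spark = (SparkSession.builder
         .appName('metastore-demo')
         .config('hive.metastore.uris', 'thrift://10.10.10.90:9083')
         .enableHiveSupport()
         .getOrCreate())
spark.sql('show databases').show()
spark.stop()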
Start
cd $HADOOP_HOME/sbin/
./start-all.sh
./mr-jobhistory-daemon.sh start historyserver   # Hadoop sbin has no start-history-server.sh; this starts the MapReduce job history server
cd $SPARK_HOME/sbin/
./start-all.sh
./start-history-server.sh
Check the master web UI
http://lx81:8080/
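The standalone master also exposes the same status as JSON, which is handy for a scripted health check; a sketch (adjust the hostname to your master):
import json
import urllib.request

# /json/ on the master web UI port returns cluster status
with urllib.request.urlopen('http://lx81:8080/json/') as resp:
    status = json.loads(resp.read())
print(status['status'])                        # expected: ALIVE
print([w['host'] for w in status['workers']])  # should list slave01 and slave02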
Command-line mode
pyspark --master spark://lx81:7077 # python
spark-shell --master spark://lx81:7077 # scala
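Inside the pyspark shell, sc and spark are pre-created; a quick smoke test that work is actually distributed to the cluster:
rdd = sc.parallelize(range(100))
print(rdd.sum())   # expected: 4950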
PySpark support
# Hive metastore service
hive --service metastore
# Thrift server
$SPARK_HOME/sbin/start-thriftserver.sh
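The Spark Thrift Server speaks the HiveServer2 protocol, so regular Hive clients can query it. A minimal sketch using the third-party pyhive package, assuming pip install pyhive[hive] and the default port 10000:
from pyhive import hive

# connect to the Spark Thrift Server started above
conn = hive.connect(host='lx81', port=10000)
cur = conn.cursor()
cur.execute('SHOW DATABASES')
print(cur.fetchall())
conn.close()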
- Enable Hive support when building the session: enableHiveSupport()
# coding:utf-8
import os
os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3'
os.environ['PYSPARK_DRIVER_PYTHON'] = '/usr/bin/python3'
from pyspark.sql import SparkSession

# connect to the Hive instance configured above
spark = SparkSession.builder.master('local').appName('test').enableHiveSupport().getOrCreate()
# enable dynamic partitioning
spark.sql('set hive.exec.dynamic.partition = true')
spark.sql('set hive.exec.dynamic.partition.mode = nonstrict')
spark.sql('drop table if exists temp.test')
# write to Hive, partitioned by the pt column
data = [
    (1, '3', '145', '1'),
    (1, '4', '146', '1'),
    (1, '5', '25', '2'),
    (1, '6', '26', '2'),
    (2, '32', '32', '3'),
    (2, '8', '134', '3'),
    (2, '8', '134', '3'),
    (2, '9', '137', '3'),
]
df = spark.createDataFrame(data, ['id', 'test_id', 'camera_id', 'pt'])
df.write.format('hive').mode('overwrite').partitionBy('pt').saveAsTable('temp.test')
# equivalent: df.write.saveAsTable('temp.test', format='hive', mode='overwrite', partitionBy='pt')
# read back from Hive
spark.sql('SELECT * FROM temp.test').show()
spark.stop()
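The table written above can also be read back through the DataFrame API instead of SQL; a self-contained sketch:
from pyspark.sql import SparkSession

spark = SparkSession.builder.master('local').appName('read-demo').enableHiveSupport().getOrCreate()
df = spark.table('temp.test')     # same rows as SELECT * FROM temp.test
df.filter(df.pt == '3').show()    # filter on the partition column
spark.stop()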
Errors
- The hostname must not contain '_', otherwise the workers fail to start.