1. Install Scala
sudo tar -zxvf scala-2.11.6.tgz -C /opt
sudo mv scala-2.11.6 scala
sudo vim /etc/profile
#scala environment
export SCALA_HOME=/opt/scala
export PATH=${SCALA_HOME}/bin:$PATH
source /etc/profile
2. Install sbt
sudo tar zxf sbt-1.2.8.tgz -C /opt
sudo mv sbt-1.2.8 sbt
sudo vim /etc/profile
#sbt environment
export SBT_HOME=/opt/sbt
export PATH=${SBT_HOME}/bin:$PATH
source /etc/profile
sbt sbtVersion
3. Install Spark
sudo tar -zvxf spark-3.0.0-bin-hadoop3.2.tgz -C /opt
sudo mv spark-3.0.0-bin-hadoop3.2 spark   # rename the extracted directory, not the .tgz
sudo vim /etc/profile
#Spark environment
export SPARK_HOME=/opt/spark
export PATH=${SPARK_HOME}/bin:$PATH
# Config for running pyspark directly inside Jupyter
export PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9-src.zip:$PYTHONPATH
export PYSPARK_PYTHON=/home/hadoop/envs/py3/bin/python3
export PYSPARK_DRIVER_PYTHON=/home/hadoop/envs/py3/bin/python3
# Alternative: have the pyspark command launch Jupyter (uncomment to enable)
#export PYSPARK_DRIVER_PYTHON=ipython
#export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
source /etc/profile
pyspark
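With the PYTHONPATH lines above in effect, a plain Python interpreter (outside the pyspark launcher) can import pyspark as well; a quick sanity check, assuming the 3.0.0 install from this guide:

# Run in plain python3/Jupyter, not via the pyspark launcher
import pyspark
print(pyspark.__version__)   # expect 3.0.0 for this install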
With the configuration above, simply start jupyter notebook and run the following word-count example:
from pyspark import SparkContext, SparkConf

conf = SparkConf().setAppName("My app")
# getOrCreate reuses a running context instead of failing if one already exists
sc = SparkContext.getOrCreate(conf=conf)
lines = sc.textFile("hdfs://localhost:9000/hive/zxx.db/t")
words = lines.flatMap(lambda line: line.split(" "))
keyvalue = words.map(lambda word: (word, 1))
result = keyvalue.reduceByKey(lambda x, y: x + y)
print(result.collect())
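collect() pulls every (word, count) pair back to the driver, so for a quick look at the most frequent words it can be handier to sort and take a few; the name top10 and the cutoff of 10 are just illustrative:

# Top 10 words by descending count; sortBy/take are standard RDD operations
top10 = result.sortBy(lambda kv: kv[1], ascending=False).take(10)
print(top10)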
4. Connect Spark to Hive
First, test in spark-shell whether this Spark build supports connecting to Hive; if the import below fails, install a Spark distribution built with Hive support.
scala> import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.sql.hive.HiveContext
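HiveContext has been deprecated since Spark 2.0 in favor of SparkSession, so the same capability can also be probed from pyspark; a minimal sketch (getOrCreate() itself aborts with a "Hive classes are not found" error when the build lacks Hive support):

from pyspark.sql import SparkSession

# Creating the session already fails here if this build lacks Hive support
spark = SparkSession.builder.enableHiveSupport().getOrCreate()
# Prints "hive" when the Hive catalog is active, "in-memory" otherwise
print(spark.conf.get("spark.sql.catalogImplementation"))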
Configure /opt/spark/conf/spark-env.sh:
export JAVA_HOME=/opt/java
export SPARK_DIST_CLASSPATH=$(/opt/hadoop/bin/hadoop classpath)
export CLASSPATH=$CLASSPATH:/opt/hive/lib
export SCALA_HOME=/opt/scala
export HADOOP_CONF_DIR=/opt/hadoop/etc/hadoop
export HIVE_CONF_DIR=/opt/hive/conf
export SPARK_CLASSPATH=$SPARK_CLASSPATH:/opt/spark/jars/mysql-connector-java-5.1.47-bin.jar
sudo cp /opt/hive/conf/hive-site.xml /opt/spark/conf
Start Jupyter and run:
from pyspark import SparkConf, SparkContext
from pyspark.sql import HiveContext

conf = SparkConf().setAppName("My app")
sc = SparkContext.getOrCreate(conf=conf)
sqlContext = HiveContext(sc)
my_dataframe = sqlContext.sql("SELECT * FROM t")
my_dataframe.show()
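The HiveContext route above still works for backward compatibility, but the idiomatic equivalent on Spark 2.x/3.x goes through SparkSession directly; a minimal sketch against the same table t:

from pyspark.sql import SparkSession

spark = (SparkSession.builder
         .appName("My app")
         .enableHiveSupport()   # attach the Hive metastore configured in hive-site.xml
         .getOrCreate())
spark.sql("SELECT * FROM t").show()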