1. Upload scala-2.10.4.tgz to /usr and extract it
tar -zvxf scala-2.10.4.tgz
2. Edit /etc/profile and add:
export SCALA_HOME=/usr/scala-2.10.4
export PATH=$PATH:${SCALA_HOME}/bin
Reload the environment:
source /etc/profile
3. Test
[root@lining05 usr]# scala
Welcome to Scala version 2.10.4 (Java HotSpot(TM) 64-Bit Server VM, Java 1.7.0_79).
Type in expressions to have them evaluated.
Type :help for more information.
scala> 23*48
res0: Int = 1104
4. Upload spark-1.3.0.tgz to /opt/modules/cdh and extract it
tar -zxvf spark-1.3.0.tgz
5. Edit make-distribution.sh and set:
VERSION=1.3.0
SCALA_VERSION=2.10
SPARK_HADOOP_VERSION=2.5.0-cdh5.3.6
SPARK_HIVE=1
JAVA_CMD="$JAVA_HOME"/bin/java
JAVA_VERSION=1.7.0_79
6. Run the make-distribution.sh script to build Spark 1.3.0
/opt/modules/cdh/spark-1.3.0/make-distribution.sh --tgz -Phadoop-2.4 -Dhadoop.version=2.5.0 -Pyarn -Phive-0.13.1 -Phive-thriftserver
The build takes a long time; if the network drops or the build stalls partway through, just run it again (it may take several attempts). Also, the MQTT dependency could not be resolved, so the build only completed after removing the mqtt module from the root pom and the mqtt dependency from the examples pom. In my case the full build took about 10 hours.
7. Configure environment variables in /etc/profile
export SPARK_HOME=/opt/modules/cdh/spark-1.3.0-bin-2.5.0
export PATH=$PATH:${SPARK_HOME}/bin
export HADOOP_CONF_DIR=/opt/modules/cdh/hadoop-2.5.0-cdh5.3.6/etc/hadoop
export HDFS_CONF_DIR=/opt/modules/cdh/hadoop-2.5.0-cdh5.3.6/etc/hadoop
export YARN_CONF_DIR=/opt/modules/cdh/hadoop-2.5.0-cdh5.3.6/etc/hadoop
source /etc/profile
8. Configure ${SPARK_HOME}/conf/spark-env.sh
export JAVA_HOME=/usr/java/jdk1.7.0_79
export SCALA_HOME=/usr/scala-2.10.4
export HADOOP_HOME=/opt/modules/cdh/hadoop-2.5.0-cdh5.3.6
export HADOOP_CONF_DIR=/opt/modules/cdh/hadoop-2.5.0-cdh5.3.6/etc/hadoop
export SPARK_MASTER_IP=192.168.198.131
export SPARK_MASTER_PORT=8070
export SPARK_MASTER_WEBUI_PORT=8090
export SPARK_WORKER_CORES=1
export SPARK_WORKER_INSTANCES=1
export SPARK_WORKER_PORT=8092
export SPARK_WORKER_MEMORY=1g
export SPARK_EXECUTOR_INSTANCES=2
export SPARK_EXECUTOR_CORES=1
export SPARK_EXECUTOR_MEMORY=1g
export SPARK_DRIVER_MEMORY=1g
export SPARK_YARN_APP_NAME=Spark
9. Edit Hadoop's yarn-site.xml and add the following (this disables YARN's virtual- and physical-memory checks so containers are not killed for exceeding the memory limits):
<property>
<name>yarn.nodemanager.vmem-check-enabled</name>
<value>false</value>
</property>
<property>
<name>yarn.nodemanager.pmem-check-enabled</name>
<value>false</value>
</property>
10. Copy yarn-site.xml, hdfs-site.xml, and core-site.xml to $SPARK_HOME
11. Configure Spark's conf/slaves file with the worker hostnames:
lining06
lining07
12. Copy the Spark directory to lining06 and lining07
13. Start Spark
/opt/modules/cdh/spark-1.3.0-bin-2.5.0/sbin/start-all.sh
14. Start the Spark shell (YARN client mode)
/opt/modules/cdh/spark-1.3.0-bin-2.5.0/bin/spark-shell --master yarn --deploy-mode client
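Once the prompt appears, a one-line job is a quick sanity check that the SparkContext can actually schedule work on YARN:
scala> sc.parallelize(1 to 100).reduce(_ + _)   // expected result: 5050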
Test data
spark01.txt — student scores
Tom,DataBase,80
Tom,Algorithm,50
Tom,DataStructure,60
Jim,DataBase,90
Jim,Algorithm,60
Jim,DataStructure,80
hdfs dfs -mkdir -p /opt/test
hdfs dfs -put /opt/test/spark01.txt /opt/test
15. How many students are there in this department in total?
// load the file
val lines=sc.textFile("/opt/test/spark01.txt")
// map each line to the substring before the first comma (the student name)
val par=lines.map(row=>row.split(",")(0))
// keep only the distinct names
val distinct_par=par.distinct()
// count the distinct names
distinct_par.count
res0: Long = 2
16. What is Tom's average score?
val lines=sc.textFile("/opt/test/spark01.txt")
// keep only the rows whose first field equals "Tom"
val tom=lines.filter(row=>row.split(",")(0)=="Tom")
tom.collect // inspect the filtered rows
// extract the score column as an Int RDD
val score = tom.map(row=>row.split(",")(2).toInt)
score.collect
// mean() is available on numeric RDDs (via DoubleRDDFunctions)
score.mean
res4: Double = 63.333333333333336
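If the total rather than the average is wanted, the same Int RDD can be summed directly; a minimal sketch (the expected value follows from the sample data above):
score.sum             // 190.0 (sum is available on numeric RDDs)
score.reduce(_ + _)   // 190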
17. Build a pair RDD of each student's total score
// load the data set
val lines=sc.textFile("/opt/test/spark01.txt")
// map each line to a (student, score: Int) pair
val score = lines.map(row=>(row.split(",")(0),row.split(",")(2).toInt))
// use reduceByKey on the pair RDD to sum each student's scores
val sumscore = score.reduceByKey((x,y)=>x+y)
sumscore.collect
res7: Array[(String, Int)] = Array((Tom,190), (Jim,230))
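The same totals could also be produced with groupByKey followed by a per-group sum, but reduceByKey is the better choice here because it combines values on the map side before the shuffle. A quick sketch for comparison:
val sumscore2 = score.groupByKey().mapValues(_.sum)
sumscore2.collect   // Array((Tom,190), (Jim,230))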
18. Build a pair RDD of each student's average score
// load the data set
val lines=sc.textFile("/opt/test/spark01.txt")
// map each line to a (student, score: Int) pair
val score = lines.map(row=>(row.split(",")(0),row.split(",")(2).toInt))
// attach a count of 1 to each score so the number of subjects can be accumulated
val score1 = score.mapValues(x=>(x,1))
score1.collect
res16: Array[(String, (Int, Int))] = Array((Tom,(80,1)), (Tom,(50,1)), (Tom,(60,1)), (Jim,(90,1)), (Jim,(60,1)), (Jim,(80,1)))
// use reduceByKey to sum both the scores and the subject counts per student
val sumscore = score1.reduceByKey((x,y)=>(x._1+y._1,x._2+y._2))
sumscore.collect
res15: Array[(String, (Int, Int))] = Array((Tom,(190,3)), (Jim,(230,3)))
// use mapValues to compute each student's average (integer division here)
val meanscore = sumscore.mapValues(x=>x._1/x._2)
meanscore.collect
res17: Array[(String, Int)] = Array((Tom,63), (Jim,76))
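Note that x._1/x._2 is integer division, which is why the averages above come out truncated as 63 and 76. If fractional averages are wanted, converting the total to Double before dividing does it; a small variant of the last step:
val meanscore2 = sumscore.mapValues(x => x._1.toDouble / x._2)
meanscore2.collect   // Array((Tom,63.33...), (Jim,76.66...))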
19. Install Kafka (pseudo-distributed)
Upload kafka_2.10-0.8.2.1.tgz to /opt/modules/cdh
Extract it:
tar -zvxf kafka_2.10-0.8.2.1.tgz
20. Configure KAFKA_HOME in /etc/profile
export KAFKA_HOME=/opt/modules/cdh/kafka_2.10-0.8.2.1
export PATH=$KAFKA_HOME/bin:$PATH
source /etc/profile
21. Replace the ZooKeeper jar bundled with Kafka so that it matches the ZooKeeper version used by the cluster
22. Configure config/server.properties
broker.id=0
host.name=lining05
advertised.host.name=lining05
log.dirs=/opt/modules/cdh/kafka_2.10-0.8.2.1/logs
zookeeper.connect=lining05:2181,lining06:2181,lining07:2181
23. Copy Kafka to lining06 and lining07
On each node, change broker.id, host.name, and advertised.host.name in server.properties
24. Set ownership
chown -R hadoop:hadoop /opt/modules/cdh/kafka_2.10-0.8.2.1
25. Start Kafka (on each node)
/opt/modules/cdh/kafka_2.10-0.8.2.1/bin/kafka-server-start.sh /opt/modules/cdh/kafka_2.10-0.8.2.1/config/server.properties
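To verify the brokers are reachable, a tiny producer written against the new producer API shipped with Kafka 0.8.2 can be run. This is only a sketch under a few assumptions: a topic named "test" already exists, the brokers listen on the default port 9092, and kafka-clients-0.8.2.1.jar (with its dependencies) is on the classpath; the object name KafkaSmokeTest is made up for this example.
import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object KafkaSmokeTest {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // assumed broker list; adjust host names and ports to the actual cluster
    props.put("bootstrap.servers", "lining05:9092,lining06:9092,lining07:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)
    // send one message to the (assumed) topic "test" and block until the broker acks it
    producer.send(new ProducerRecord[String, String]("test", "hello", "kafka is up")).get()
    producer.close()
  }
}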