Standalone Spark Installation
[root@linux01 install]# tar zxf spark-2.4.5-bin-hadoop2.6.tgz -C ../soft
[root@linux01 soft]# mv spark-2.4.5-bin-hadoop2.6 spark245
[root@linux01 conf]# cp spark-env.sh.template spark-env.sh
[root@linux01 conf]# vi ./spark-env.sh
Append the following at the end of the file:
export JAVA_HOME=/opt/soft/jdk180
export SCALA_HOME=/opt/soft/scala211
export SPARK_HOME=/opt/soft/spark245
export HADOOP_INSTALL=/opt/soft/hadoop260
export HADOOP_CONF_DIR=$HADOOP_INSTALL/etc/hadoop
export SPARK_MASTER_HOST=linux01   # SPARK_MASTER_IP is the deprecated pre-2.0 name of this variable
export SPARK_DRIVER_MEMORY=2G
export SPARK_EXECUTOR_MEMORY=2G
export SPARK_LOCAL_DIRS=/opt/soft/spark245
[root@linux01 ~]# vi /etc/profile
Append the following at the end of the file:
#scala
export SCALA_HOME=/opt/soft/scala211
export PATH=$PATH:$SCALA_HOME/bin
#spark
export SPARK_HOME=/opt/soft/spark245
export PATH=$PATH:$SPARK_HOME/bin
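Reload the profile so the new PATH entries take effect:
[root@linux01 ~]# source /etc/profile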
[root@linux01 conf]# cd /opt/soft/spark245/sbin/
[root@linux01 sbin]# ./start-all.sh
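If the daemons started cleanly, jps should now list a Master and a Worker process:
[root@linux01 sbin]# jps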
[root@linux01 ~]# spark-shell
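A quick smoke test inside the shell; summing 1 to 100 should return 5050:
scala> sc.parallelize(1 to 100).reduce(_ + _)
res0: Int = 5050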
WordCount Example
Spark RDD functions: pressing Tab after an RDD reference in spark-shell lists the methods available, as shown below; a few of them are demonstrated right after the list.
++ countByValue histogram partitions saveAsTextFile toLocalIterator
aggregate countByValueApprox id persist setName toString
barrier dependencies intersection pipe sortBy top
cache distinct isCheckpointed popStdev sparkContext treeAggregate
canEqual filter isEmpty popVariance stats treeReduce
cartesian first iterator preferredLocations stdev union
checkpoint flatMap keyBy productArity subtract unpersist
coalesce fold localCheckpoint productElement sum variance
collect foreach map productIterator sumApprox zip
collectAsync foreachAsync mapPartitions productPrefix take zipPartitions
compute foreachPartition mapPartitionsWithIndex randomSplit takeAsync zipWithIndex
context foreachPartitionAsync max reduce takeOrdered zipWithUniqueId
copy getCheckpointFile mean repartition takeSample
count getNumPartitions meanApprox sample toDF
countApprox getStorageLevel min sampleStdev toDS
countApproxDistinct glom name sampleVariance toDebugString
countAsync groupBy partitioner saveAsObjectFile toJavaRDD
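A few of the listed methods in action (run in spark-shell; the inline results are illustrative, and collect order may vary):
scala> val rdd = sc.parallelize(Seq(3, 1, 2, 3))
scala> rdd.count                    // 4
scala> rdd.distinct.collect         // Array(1, 2, 3)
scala> rdd.sortBy(x => x).take(2)   // Array(1, 2)
scala> rdd.countByValue             // Map(3 -> 2, 1 -> 1, 2 -> 1)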
Interactive shell (choosing a master)
[root@linux01 conf]# spark-shell --master local[*]                    # local mode
[root@linux01 conf]# spark-shell --master spark://linux01:7077        # standalone cluster
[root@linux01 conf]# spark-shell --master yarn --deploy-mode client   # YARN client mode
scala> sc.textFile("hdfs://linux01:9000/sparktmp/hello.txt")
scala> sc.textFile("hdfs://linux01:9000/sparktmp/hello.txt").flatMap(x=>x.split(" "))
scala> sc.textFile("hdfs://linux01:9000/sparktmp/hello.txt").flatMap(x=>x.split(" "))
.map(x=>(x,1)).reduceByKey((x,y)=>x+y)
scala> sc.textFile("hdfs://linux01:9000/sparktmp/hello.txt").flatMap(x=>x.split(" "))
.map(x=>(x,1)).reduceByKey((x,y)=>x+y)
.collect.foreach(println)
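The same pipeline can write its result back to HDFS instead of printing it; the output directory below is an assumption and must not already exist:
scala> sc.textFile("hdfs://linux01:9000/sparktmp/hello.txt").flatMap(x=>x.split(" "))
.map(x=>(x,1)).reduceByKey((x,y)=>x+y)
.saveAsTextFile("hdfs://linux01:9000/sparktmp/wcout")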
Implementing the WordCount example in IDEA
Locate log4j.properties
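To quiet the console, copy Spark's conf/log4j.properties.template into the project's src/main/resources as log4j.properties and lower the root level; a minimal sketch based on that template:
log4j.rootCategory=WARN, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.target=System.err
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n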
package cn.kgc.kb15

import java.io.FileInputStream
import java.util.Properties

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.{SparkConf, SparkContext}

object SparkDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("sparkDemo")
    val sc: SparkContext = SparkContext.getOrCreate(conf)
    println(sc)
    val spark: SparkSession = SparkSession.builder().master("local[*]").appName("sparkDemo").getOrCreate()
    println(spark)

    // step-by-step word count against HDFS
//    val rdd: RDD[String] = sc.textFile("hdfs://192.168.111.131:9000/sparktmp/hello.txt")
//    println("-------------------------------rdd")
//    rdd.collect.foreach(println)
//    val rdd2: RDD[String] = rdd.flatMap(x => x.split(" "))
//    println("-------------------------rdd2")
//    rdd2.collect().foreach(println)
//    val rdd3: RDD[(String, Int)] = rdd2.map(x => (x, 1))
//    println("----------------------------rdd3")
//    rdd3.collect().foreach(println)
//    val rdd4: RDD[(String, Int)] = rdd3.reduceByKey((x, y) => x + y)
//    println("---------------------------rdd4")
//    rdd4.collect().foreach(println)

    // create an RDD from a local file via sc
//    val rdd11: RDD[String] = sc.textFile("E:\\创建工程\\sparkStu2021_12_06\\in\\helloworld.txt")
//    rdd11.collect().foreach(println)
//    println("-------------------------------------------")
//    rdd11.flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)

    // other ways to build an RDD
//    val rdd: RDD[Int] = sc.parallelize(1 to 10)
//    val rdd1: RDD[String] = sc.parallelize(Array("hello java", "hello hive", "hello spark"))
//    val rdd2: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4))
//    rdd.collect().foreach(println)
//    rdd1.collect().foreach(println)
//    rdd2.collect().foreach(println)

    // locate a resource on the classpath
//    val path = Thread.currentThread().getContextClassLoader.getResource("test.properties").getPath
//    println(path)

    // read the input path from a properties file on the server
    val prop = new Properties()
    prop.load(new FileInputStream(Consol.Target_File))
    val target = prop.getProperty(Consol.PATH)
    println(target)
    val rdd11: RDD[String] = sc.textFile(target)
    /**
     * public static String Target_File = "/opt/sparktmp/test.properties";
     *
     * public static String PATH = "path";
     */
    rdd11.flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)

    // variant with the properties path hardcoded on the server
//    val prop = new Properties()
//    prop.load(new FileInputStream("/opt/sparktmp/test.properties"))
//    val target = prop.getProperty("path")
//    println(target)
//    val rdd11: RDD[String] = sc.textFile(target)
//    rdd11.flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)

    // variant reading the properties file from the local Windows project
//    val prop = new Properties()
//    prop.load(new FileInputStream("E:\\创建工程\\sparkStu2021_12_06\\resources\\test.properties"))
//    val target = prop.getProperty("path")
//    println(target)
//    val rdd11: RDD[String] = sc.textFile(target)
//    rdd11.flatMap(x => x.split(" ")).map((_, 1)).reduceByKey(_ + _).collect().foreach(println)

    spark.stop()  // also stops the underlying SparkContext
  }
}
package cn.kgc.kb15;

public class Consol {
    /**
     * Location of the properties file Spark reads the input path from.
     */
    public static String Target_File = "/opt/sparktmp/test.properties";
    public static String PATH = "path";
}
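For reference, the test.properties file that Consol.Target_File points at holds a single key; the value here is an assumption matching the HDFS file used earlier:
path=hdfs://linux01:9000/sparktmp/hello.txt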
Export the jar and submit it with spark-submit:
spark-submit --class cn.kgc.kb15.SparkDemo --master local[*] ./<jar file>
spark-submit --class cn.kgc.kb15.SparkDemo --master spark://linux01:7077 ./<jar file>
spark-submit --class cn.kgc.kb15.SparkDemo --master yarn --deploy-mode client ./<jar file>
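Resource flags can be added to any of the above; a sketch with placeholder values:
spark-submit --class cn.kgc.kb15.SparkDemo --master yarn --deploy-mode client \
  --driver-memory 1g --executor-memory 1g --num-executors 2 ./<jar file>
Whichever master is used, the job ends by printing the word counts: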
(scala,1)
(are,1)
(you,1)
(hive,1)
(how,1)
(hello,5)
(java,1)
(hbase,1)
(world,1)