一、SparkUtils工具类
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Helpers for creating and releasing a SparkContext.
 */
object SparkUtils {

  /** Master URL used when the caller does not supply one. */
  val DEFAULT_MASTER = "local[*]"

  /**
   * Creates a SparkContext for the given application name, using
   * [[DEFAULT_MASTER]] as the master URL.
   *
   * @param appName application name set on the SparkConf
   * @return a newly constructed SparkContext
   */
  def getSparkContext(appName: String): SparkContext =
    getSparkContext(appName, DEFAULT_MASTER)

  /**
   * Creates a SparkContext configured with the given application name and master URL.
   *
   * @param appName application name set on the SparkConf
   * @param master  master URL set on the SparkConf
   * @return a newly constructed SparkContext
   */
  def getSparkContext(appName: String, master: String): SparkContext = {
    val conf = new SparkConf()
      .setAppName(appName)
      .setMaster(master)
    new SparkContext(conf)
  }

  /**
   * Stops the given SparkContext; a null reference is ignored.
   *
   * @param sc the context to stop, may be null
   */
  def close(sc: SparkContext): Unit =
    Option(sc).foreach(_.stop())
}
二、日志工具
import org.apache.log4j.{Level, Logger}
/**
 * Mix-in that reduces log output: when a class or object extending this trait
 * is initialised, each of the loggers named below is set to the WARN level.
 */
trait LoggerTrait {
  // Applied in the listed order, every logger gets the same WARN level.
  Seq("org.apache.spark", "org.apache.hadoop", "org.spark_project")
    .foreach(loggerName => Logger.getLogger(loggerName).setLevel(Level.WARN))
}
三、Spark算子Sample
package cn.qphone.spark.core.day3_RDD_suanzi
import cn.qphone.spark.common.LoggerTrait.LoggerTrait
import cn.qphone.spark.common.Utils.SparkUtils
import org.apache.spark.rdd.RDD
/**
 * Demo of the RDD `sample` operator: draws two samples (with replacement) from
 * the numbers 1 to 10, using fractions 0.4 and 0.01, and prints the element
 * count and the elements of each sample.
 */
object Deom4_Sample extends LoggerTrait {

  /**
   * Entry point. Creates a SparkContext, runs both sampling demos, and always
   * releases the context afterwards.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // 1. Obtain the SparkContext.
    val sc = SparkUtils.getSparkContext("Deom4_Sample")
    try {
      // 2. Source data.
      val numbers = 1 to 10
      // 3. Load the data into an RDD with a single partition.
      val listRDD: RDD[Int] = sc.parallelize(numbers, 1)
      // 4./5. Sample with replacement at two different fractions and print each result.
      report(listRDD.sample(withReplacement = true, fraction = 0.4))
      report(listRDD.sample(withReplacement = true, fraction = 0.01))
    } finally {
      // 6. Release resources even when one of the jobs above throws,
      //    so the SparkContext is never left running.
      SparkUtils.close(sc)
    }
  }

  /**
   * Prints the number of elements in the sample followed by each element.
   *
   * @param sampleRDD the sampled RDD to print
   */
  private def report(sampleRDD: RDD[Int]): Unit = {
    // NOTE(review): the one-second pauses are kept from the original; presumably they
    // keep the printed results apart from surrounding log output — confirm before removing.
    Thread.sleep(1000)
    println("样本空间元素个数:" + sampleRDD.count())
    sampleRDD.foreach(println)
    Thread.sleep(1000)
  }
}