Spark 的部分算子(二)
-
map、foreach、filter 的使用
// Demo of the basic RDD operators map, foreach and filter on a local Spark context.
val sparkConf: SparkConf = new SparkConf()
sparkConf.setMaster("local")
sparkConf.setAppName("RDD_Test04")
val sparkContext: SparkContext = new SparkContext(sparkConf)

val array1 = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
// The integers 0..99; built directly instead of filling a zeroed array in a loop.
val array3: Array[Int] = Array.range(0, 100)

val rdd1: RDD[Int] = sparkContext.parallelize(array1)

println("-------------map-------------")
// map transforms each element one-to-one: first doubling, then pairing each value with itself.
rdd1.map(x => x * 2).foreach(println)
rdd1.map(x => (x, x)).foreach(println)

println("-------------filter-------------")
// Keep only the odd numbers. `!= 0` (rather than `== 1`) also classifies negative
// odd values correctly, because in Scala -3 % 2 == -1.
rdd1.filter(x => x % 2 != 0).foreach(println)
-
flatMap的使用
// Demo of flatMap: every input line is split into words and the words are
// flattened into a single RDD of strings.
val sparkConf: SparkConf = new SparkConf()
sparkConf.setMaster("local").setAppName("RDD_Test04")
val sparkContext: SparkContext = new SparkContext(sparkConf)

val array2 = Array(
  "hello huangbo",
  "hello xuzheng",
  "hello wangboqing"
)
val rdd2: RDD[String] = sparkContext.makeRDD(array2)

// Split each line on a single space; flatMap concatenates the resulting word arrays.
val rdd3 = rdd2.flatMap(_.split(" "))
rdd3.foreach(println)
-
sample 采样算子的使用
println("-------------sample-------------")
// Sampling operators. Parameters of RDD.sample:
//   withReplacement: Boolean - whether a sampled element is put back (may be drawn again)
//   fraction: Double         - the sampling probability / expected fraction of the data
//   seed: Long               - random seed; defaults to Utils.random.nextLong
// This snippet reuses `sparkContext` and `array3` defined in the earlier map/filter snippet.
val rdd5: RDD[Int] = sparkContext.makeRDD(array3)

// sample is a transformation: it returns a new RDD whose size is only approximately
// fraction * count, so collect() is needed to bring the result to the driver.
val str: String = rdd5
  .sample(withReplacement = true, fraction = 0.05, seed = 0L)
  .collect()
  .mkString(",")
println(str)

// takeSample is an action: it returns an Array (not an RDD, despite the name `rdd4`)
// containing exactly `num` sampled elements.
val rdd4: Array[Int] = rdd5.takeSample(withReplacement = true, num = 10, seed = 0L)
val str2: String = rdd4.mkString(",")
println(str2)
-
mapPartitions和mapPartitionsWithIndex的使用
package day01

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Demonstrates `mapPartitions` and `mapPartitionsWithIndex` on a small RDD
 * split into three partitions, squaring every element.
 */
object RDD_Test03 {

  /**
   * Runs both demos on a local Spark context and prints the results.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("RDD_Test03")
    conf.setMaster("local")
    val context: SparkContext = new SparkContext(conf)

    // Always release the context, even if one of the jobs below fails.
    try {
      val array1 = Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
      val rdd1: RDD[Int] = context.parallelize(array1, 3)

      println("----------------mapPartitions-------------")
      // The function is invoked once per partition (not once per element), so the
      // message below is printed once for each of the three partitions.
      val rdd3: RDD[Int] = rdd1.mapPartitions { (partition: Iterator[Int]) =>
        println("执行了一次分区处理")
        // Map the iterator lazily instead of materialising the whole partition
        // into a List first, which would hold every element in memory at once.
        partition.map(value => value * value)
      }
      rdd3.foreach(x => println(x))

      println("------------------mapPartitionsWithIndex------------------------")
      // f: (Int, Iterator[T]) => Iterator[U]
      // The Int is the partition index; with 4 partitions it would be 0, 1, 2, 3.
      val rdd2: RDD[Int] = rdd1.mapPartitionsWithIndex { (index: Int, data: Iterator[Int]) =>
        println("执行了一次分区处理 当前的分区编号是:" + index)
        // `data` is already an Iterator, so no further conversion is needed.
        data.map(value => value * value)
      }
      rdd2.foreach(x => println(x))
    } finally {
      context.stop()
    }
  }
}