// Spark RDD API examples (original title: "Spark 之RDD API大全")

package scala

import org.apache.spark.{SparkConf, SparkContext}

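/**
  * A collection of small, self-contained examples of the Spark RDD API.
  * Each `demo*` method exercises one RDD operation on tiny in-memory data;
  * the trailing comments show the results recorded by the original author.
  */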
object SparkAPI extends App {
  // Both values are lazy because `App` defers the object body until main() runs:
  // with plain vals, a demo method called from another entry point (a test, a
  // REPL import, another main) would find `sc` still null. Lazy initialization
  // builds the configuration and the context on first use instead.
  lazy val conf = new SparkConf().setAppName("SparkTransformationTest").setMaster("local")
  lazy val sc = new SparkContext(conf)


  /**
    * aggregate
    */
  def demoAggregate(): Unit = {
    // Exercises aggregate(zeroValue)(seqOp, combOp) on Int and String RDDs.
    // The "=>" notes are the REPL results recorded by the original author.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    // Labels each Int with the index of the partition it was read from.
    def labelInts(partId: Int, elems: Iterator[Int]): Iterator[String] =
      elems.toList.map(v => s"[partID:$partId, val: $v]").iterator

    numbers.mapPartitionsWithIndex(labelInts).collect()
    // => Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])
    numbers.aggregate(0)((acc, v) => math.max(acc, v), (l, r) => l + r)
    // => 9
    numbers.aggregate(5)((acc, v) => math.max(acc, v), (l, r) => l + r)
    // => 16 (the recorded result shows the zero value 5 being counted more than once)

    val letters = sc.parallelize(List("a", "b", "c", "d", "e", "f"), 2)

    // Same labelling helper for String elements.
    def labelStrings(partId: Int, elems: Iterator[String]): Iterator[String] =
      elems.toList.map(v => s"[partID:$partId, val: $v]").iterator

    letters.mapPartitionsWithIndex(labelStrings).collect()
    // => Array([partID:0, val: a], [partID:0, val: b], [partID:0, val: c], [partID:1, val: d], [partID:1, val: e], [partID:1, val: f])

    letters.aggregate("")((acc, v) => acc + v, (l, r) => l + r)
    // => abcdef

    letters.aggregate("x")((acc, v) => acc + v, (l, r) => l + r)
    // => xxdefxabc

    val digitStrings = sc.parallelize(List("12", "23", "345", "4567"), 2)
    digitStrings.aggregate("")(
      (acc, v) => math.max(acc.length, v.length).toString,
      (l, r) => l + r)
    // => 42

    digitStrings.aggregate("")(
      (acc, v) => math.min(acc.length, v.length).toString,
      (l, r) => l + r)
    // => 11

    val withEmptyString = sc.parallelize(List("12", "23", "345", ""), 2)
    withEmptyString.aggregate("")(
      (acc, v) => math.min(acc.length, v.length).toString,
      (l, r) => l + r)
    // => 10
  }

  /**
    * aggregateByKey
    */
  def demoAggregateByKey(): Unit = {
    // Exercises aggregateByKey(zeroValue)(seqOp, combOp) on a two-partition pair RDD.
    val animals = sc.parallelize(
      List(("cat", 2), ("cat", 5), ("mouse", 4), ("cat", 12), ("dog", 12), ("mouse", 2)), 2)

    // Labels each pair with the index of the partition it was read from.
    def labelPairs(partId: Int, elems: Iterator[(String, Int)]): Iterator[String] =
      elems.toList.map(kv => s"[partID:$partId, val: $kv]").iterator

    animals.mapPartitionsWithIndex(labelPairs).collect()
    // => Array([partID:0, val: (cat,2)], [partID:0, val: (cat,5)], [partID:0, val: (mouse,4)], [partID:1, val: (cat,12)], [partID:1, val: (dog,12)], [partID:1, val: (mouse,2)])
    animals.aggregateByKey(0)((acc, v) => math.max(acc, v), (l, r) => l + r).collect()
    // => Array((dog,12), (cat,17), (mouse,6))
    animals.aggregateByKey(100)((acc, v) => math.max(acc, v), (l, r) => l + r).collect()
    // => Array((dog,100), (cat,200), (mouse,200))
  }

  /**
    * cartesian
    */
  def demo_cartesian(): Unit = {
    // Builds all (left, right) combinations of two five-element RDDs.
    val left = sc.parallelize(List(1, 2, 3, 4, 5))
    val right = sc.parallelize(List(6, 7, 8, 9, 10))
    val allPairs = left.cartesian(right)
    allPairs.collect()
    // => Array((1,6), (1,7), (1,8), (1,9), (1,10), (2,6), (2,7), (2,8), (2,9), (2,10), (3,6), (3,7), (3,8), (3,9), (3,10), (4,6), (5,6), (4,7), (5,7), (4,8), (5,8), (4,9), (4,10), (5,9), (5,10))
  }

  /**
    * checkpoint
    */
  def demo_checkpoint(): Unit = {
    // Marks an RDD for checkpointing under /tmp, runs an action on it, and
    // reports whether the RDD ended up checkpointed.
    sc.setCheckpointDir("/tmp")
    val a = sc.parallelize(1 to 4)
    // checkpoint() returns Unit, so printing its result only ever showed "()";
    // call it for its effect and print something informative afterwards.
    a.checkpoint()
    println(a.count)
    // Expected to be true once the count job above has run (see RDD.checkpoint docs).
    println(a.isCheckpointed)
  }

  /**
    * coalesce, repartition
    */
  def demo_coalesce(): Unit = {
    // Reduces a 10-partition RDD to 2 partitions with shuffling switched off.
    val tenPartitions = sc.parallelize(1 to 10, 10)
    val twoPartitions = tenPartitions.coalesce(numPartitions = 2, shuffle = false)
    twoPartitions.partitions.length
    // => 2
  }

  /**
    * cogroup [Pair], groupWith [Pair]
    */
  def demo_cogroup(): Unit = {
    // cogroup collects, per key, the values found in each participating pair RDD.
    val keys = sc.parallelize(List(1, 2, 1, 3), 1)
    val taggedB = keys.map(k => (k, "b"))
    val taggedC = keys.map(k => (k, "c"))
    taggedB.cogroup(taggedC).collect()
    // => Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //      (2,(ArrayBuffer(b),ArrayBuffer(c))),
    //      (3,(ArrayBuffer(b),ArrayBuffer(c))),
    //      (1,(ArrayBuffer(b, b),ArrayBuffer(c, c)))
    //    )

    // Three-way variant.
    val taggedD = keys.map(k => (k, "d"))
    taggedB.cogroup(taggedC, taggedD).collect()
    // => Array[(Int, (Iterable[String], Iterable[String], Iterable[String]))] = Array(
    //      (2,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //      (3,(ArrayBuffer(b),ArrayBuffer(c),ArrayBuffer(d))),
    //      (1,(ArrayBuffer(b, b),ArrayBuffer(c, c),ArrayBuffer(d, d)))
    //    )

    // Keys present on only one side show up with an empty group for the other side.
    val fruits = sc.parallelize(List((1, "apple"), (2, "banana"), (3, "orange"), (4, "kiwi")), 2)
    val devices = sc.parallelize(List((5, "computer"), (1, "laptop"), (1, "desktop"), (4, "iPad")), 2)
    fruits.cogroup(devices).collect()
    // => Array[(Int, (Iterable[String], Iterable[String]))] = Array(
    //      (4,(ArrayBuffer(kiwi),ArrayBuffer(iPad))),
    //      (2,(ArrayBuffer(banana),ArrayBuffer())),
    //      (3,(ArrayBuffer(orange),ArrayBuffer())),
    //      (1,(ArrayBuffer(apple),ArrayBuffer(laptop, desktop))),
    //      (5,(ArrayBuffer(),ArrayBuffer(computer))))
  }

  /**
    * collect, toArray
    */
  def demo_collect(): Unit = {
    // Pulls the whole RDD back to the driver as an array.
    val animals = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    animals.collect()
    // => Array(Gnu, Cat, Rat, Dog, Gnu, Rat)
  }

  /**
    * collectAsMap [Pair]
    */
  def demo_collectAsMap(): Unit = {
    // Zips an RDD with itself to get (n, n) pairs, then collects them as a Map;
    // the recorded result keeps a single entry for the repeated key 1.
    val numbers = sc.parallelize(List(1, 2, 1, 3), 1)
    val selfPairs = numbers.zip(numbers)
    selfPairs.collectAsMap()
    // => scala.collection.Map[Int,Int] = Map(2 -> 2, 1 -> 1, 3 -> 3)
  }

  /**
    * combineByKey[Pair]
    */
  def demo_combineByKey(): Unit = {
    // Groups animal names into one List per numeric key using combineByKey.
    val names = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val groupIds = sc.parallelize(List(1, 1, 2, 2, 2, 1, 2, 2, 2), 3)
    val keyed = groupIds.zip(names)

    // The three functions combineByKey needs, named for readability.
    val createCombiner = (name: String) => List(name)
    val mergeValue = (acc: List[String], name: String) => name :: acc
    val mergeCombiners = (l: List[String], r: List[String]) => l ::: r

    val grouped = keyed.combineByKey(createCombiner, mergeValue, mergeCombiners)
    grouped.collect()
    // => Array[(Int, List[String])] = Array((1,List(cat, dog, turkey)), (2,List(gnu, rabbit, salmon, bee, bear, wolf)))
  }

  /**
    * countApproxDistinct
    */
  def demo_countApproxDistinct(): Unit = {
    // Estimates the number of distinct values (10000 real ones, each repeated
    // five times) at several accuracy settings.
    val base = sc.parallelize(1 to 10000, 20)
    val fiveCopies = base.union(base).union(base).union(base).union(base)

    fiveCopies.countApproxDistinct(0.1)
    // => 8224

    fiveCopies.countApproxDistinct(0.05)
    // => 9750

    fiveCopies.countApproxDistinct(0.01)
    // => 9947

    fiveCopies.countApproxDistinct(0.001)
    // => 10000
  }

  /**
    * countApproxDistinctByKey [Pair]
    */
  def demo_countApproxDistinctByKey(): Unit = {
    // Draws 10000 animal names with replacement, pairs each with a unique
    // number, then estimates the distinct values per name.
    val animals = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    val sampled = sc.parallelize(animals.takeSample(withReplacement = true, num = 10000, seed = 0), 20)
    val ids = sc.parallelize(1 to sampled.count().toInt, 20)
    val keyed = sampled.zip(ids)

    keyed.countApproxDistinctByKey(0.1).collect()
    // => Array((Rat,2567), (Cat,3357), (Dog,2414), (Gnu,2494))

    keyed.countApproxDistinctByKey(0.01).collect()
    // => Array((Rat,2555), (Cat,2455), (Dog,2425), (Gnu,2513))

    keyed.countApproxDistinctByKey(0.001).collect()
    // => Array((Rat,2562), (Cat,2464), (Dog,2451), (Gnu,2521))
  }

  /**
    * countByKey [Pair]
    */
  def demo_countByKey(): Unit = {
    // Counts how many pairs exist for each key.
    val pairs = sc.parallelize(List((3, "Gnu"), (3, "Yak"), (5, "Mouse"), (3, "Dog")), 2)
    pairs.countByKey()
    // => scala.collection.Map[Int,Long] = Map(3 -> 3, 5 -> 1)
  }

  /**
    * countByValue
    */
  def demo_countByValue(): Unit = {
    // Counts the occurrences of each distinct element.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 2, 4, 2, 1, 1, 1, 1, 1))
    numbers.countByValue()
    // => scala.collection.Map[Int,Long] = Map(5 -> 1, 8 -> 1, 3 -> 1, 6 -> 1, 1 -> 6, 2 -> 3, 4 -> 2, 7 -> 1)
  }

  /**
    * distinct
    */
  def demo_distinct(): Unit = {
    // Removes duplicates, then shows the partition count requested via distinct(n).
    val animals = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog", "Gnu", "Rat"), 2)
    animals.distinct().collect()
    // => Array(Dog, Gnu, Cat, Rat)

    val oneToTen = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10))
    val intoTwo = oneToTen.distinct(2)
    intoTwo.partitions.length
    // => 2

    val intoThree = oneToTen.distinct(3)
    intoThree.partitions.length
    // => 3
  }

  /**
    * filter
    */
  def demo_filter(): Unit = {
    // Part 1: keep only the even numbers.
    val oneToTen = sc.parallelize(1 to 10, 3)
    val evens = oneToTen.filter(n => n % 2 == 0)
    evens.collect()
    // => Array(2, 4, 6, 8, 10)

    // Part 2: collect(partialFunction) on a mixed-type RDD; per the recorded
    // result the Double elements, which match no case, are dropped.
    val mixed = sc.parallelize(List("cat", "horse", 4.0, 3.5, 2, "dog"))
    val described = mixed.collect({
      case _: Int => "is integer"
      case _: String => "is string"
    })
    described.collect()
    // => Array(is string, is string, is integer, is string)

    // The same cases as a standalone PartialFunction, probed with isDefinedAt.
    val describe: PartialFunction[Any, Any] = {
      case _: Int => "is integer"
      case _: String => "is string"
    }
    describe.isDefinedAt("")
    // => true

    describe.isDefinedAt(1)
    // => true

    describe.isDefinedAt(1.5)
    // => false
  }

  /**
    * filterByRange
    */
  def demo_filterByRange(): Unit = {
    // Sorts a pair RDD by key, keeps keys 1 through 3, and prints each hit as "key:value".
    val unsorted = sc.parallelize(
      List((2, "cat"), (6, "mouse"), (7, "cup"), (3, "book"), (4, "tv"), (1, "screen"), (5, "heater")), 3)
    val byKey = unsorted.sortByKey()

    val inRange = byKey.filterByRange(1, 3).collect()
    inRange.foreach { case (key, value) => println(s"$key:$value") }
    // collected array: Array((1,screen), (2,cat), (3,book))
  }

  /**
    * flatMap
    */
  def demo_flatMap(): Unit = {
    // Each n expands to the range 1..n.
    val oneToTen = sc.parallelize(1 to 10, 5)
    oneToTen.flatMap(n => 1 to n).collect()
    // => Array(1, 1, 2, 1, 2, 3, 1, 2, 3, 4, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5, 6, 7, 1, 2, 3, 4, 5, 6, 7, 8, 1, 2, 3, 4, 5, 6, 7, 8, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    // Each element is emitted three times.
    val triple = sc.parallelize(List(1, 2, 3), 2)
    triple.flatMap(v => List(v, v, v)).collect()
    // => Array(1, 1, 1, 2, 2, 2, 3, 3, 3)

    // Each element is repeated a random number of times (0 to 9), so the
    // output differs from run to run; the line below is one recorded sample.
    val repeated = sc.parallelize(1 to 10, 3)
    repeated.flatMap(v => List.fill(scala.util.Random.nextInt(10))(v)).collect()
    // => Array(1, 2, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 8, 8, 8, 8, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10)
  }

  /**
    * flatMapValues [Pair]
    *
    * def flatMapValues[U](f: V => TraversableOnce[U]): RDD[(K, U)]
    */
  def demo_flatMapValues(): Unit = {
    // Wraps every value in "x...x"; the resulting String is flattened into its
    // characters, each paired with the original key (the word length).
    val words = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val byLength = words.map(word => (word.length, word))
    byLength.flatMapValues(word => "x" + word + "x").collect()
    // => Array[(Int, Char)] = Array((3,x), (3,d), (3,o), (3,g), (3,x), (5,x), (5,t), (5,i), (5,g), (5,e), (5,r), (5,x), (4,x), (4,l), (4,i), (4,o), (4,n), (4,x), (3,x), (3,c), (3,a), (3,t), (3,x), (7,x), (7,p), (7,a), (7,n), (7,t), (7,h), (7,e), (7,r), (7,x), (5,x), (5,e), (5,a), (5,g), (5,l), (5,e), (5,x))
  }

  /**
    * fold
    *
    * def fold(zeroValue: T)(op: (T, T) => T): T
    */
  def demo_fold(): Unit = {
    // Sums the elements with fold, using 0 as the zero value.
    val numbers = sc.parallelize(List(1, 2, 3), 3)
    numbers.fold(0)((l, r) => l + r)
    // => 6
  }

  /**
    * foldByKey [Pair]
    * def foldByKey(zeroValue: V)(func: (V, V) => V): RDD[(K, V)]
    * def foldByKey(zeroValue: V, numPartitions: Int)(func: (V, V) => V): RDD[(K, V)]
    * def foldByKey(zeroValue: V, partitioner: Partitioner)(func: (V, V) => V): RDD[(K, V)]
    */
  def demo_foldByKey(): Unit = {
    // Concatenates, per word length, all words of that length.
    val threeLetterWords = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val keyedThree = threeLetterWords.map(word => (word.length, word))
    keyedThree.foldByKey("")((l, r) => l + r).collect()
    // => Array((3,dogcatowlgnuant))

    val mixedWords = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val keyedMixed = mixedWords.map(word => (word.length, word))
    keyedMixed.foldByKey("")((l, r) => l + r).collect()
    // => Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
    * foreachPartition
    * def foreachPartition(f: Iterator[T] => Unit)
    */
  def demo_foreachPartition(): Unit = {
    // Prints the sum of each of the three partitions.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9), 3)
    numbers.foreachPartition { part =>
      val partitionSum = part.reduce((l, r) => l + r)
      println(partitionSum)
    }
    // printed values as recorded:
    //    6
    //    15
    //    24
  }

  /**
    * fullOuterJoin
    * def fullOuterJoin[W](other: RDD[(K, W)], numPartitions: Int): RDD[(K, (Option[V], Option[W]))]
    * def fullOuterJoin[W](other: RDD[(K, W)]): RDD[(K, (Option[V], Option[W]))]
    * def fullOuterJoin[W](other: RDD[(K, W)], partitioner: Partitioner): RDD[(K, (Option[V], Option[W]))]
    */
  def demo_fullOuterJoin(): Unit = {
    // Joins two pair RDDs, keeping keys found on either side; the missing side
    // appears as None in the recorded result.
    val left = sc.parallelize(List(("cat", 2), ("cat", 5), ("book", 4), ("cat", 12)))
    val right = sc.parallelize(List(("cat", 2), ("cup", 5), ("mouse", 4), ("cat", 12)))
    val joined = left.fullOuterJoin(right)
    joined.collect()
    // => Array[(String, (Option[Int], Option[Int]))] = Array((book,(Some(4),None)), (mouse,(None,Some(4))), (cup,(None,Some(5))), (cat,(Some(2),Some(2))), (cat,(Some(2),Some(12))), (cat,(Some(5),Some(2))), (cat,(Some(5),Some(12))), (cat,(Some(12),Some(2))), (cat,(Some(12),Some(12))))
  }

  /**
    * groupBy
    *
    * def groupBy[K: ClassTag](f: T => K): RDD[(K, Iterable[T])]
    * def groupBy[K: ClassTag](f: T => K, numPartitions: Int): RDD[(K, Iterable[T])]
    * def groupBy[K: ClassTag](f: T => K, p: Partitioner): RDD[(K, Iterable[T])]
    */
  def demo_groupBy(): Unit = {
    val oneToNine = sc.parallelize(1 to 9, 3)

    // Group by a computed String key.
    oneToNine.groupBy(n => n % 2 match {
      case 0 => "even"
      case _ => "odd"
    }).collect()
    // => Array((even,ArrayBuffer(2, 4, 6, 8)), (odd,ArrayBuffer(1, 3, 5, 7, 9)))

    // Group by a named key function: remainder of division by 2.
    def parity(n: Int): Int = n % 2

    oneToNine.groupBy(parity).collect()
    // => Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))

    // Same grouping with an explicit partition count of 1.
    oneToNine.groupBy(n => parity(n), 1).collect()
    // => Array((0,ArrayBuffer(2, 4, 6, 8)), (1,ArrayBuffer(1, 3, 5, 7, 9)))
  }

  /**
    * groupByKey [Pair]
    * def groupByKey(): RDD[(K, Iterable[V])]
    * def groupByKey(numPartitions: Int): RDD[(K, Iterable[V])]
    * def groupByKey(partitioner: Partitioner): RDD[(K, Iterable[V])]
    */
  def demo_groupByKey(): Unit = {
    // Keys every word by its length, then gathers the words sharing a length.
    val words = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val byLength = words.keyBy(word => word.length)
    val grouped = byLength.groupByKey()
    grouped.collect()
    // => Array((4,ArrayBuffer(lion)), (6,ArrayBuffer(spider)), (3,ArrayBuffer(dog, cat)), (5,ArrayBuffer(tiger, eagle)))
  }

  /**
    * histogram [Double]
    * def histogram(bucketCount: Int): Pair[Array[Double], Array[Long]]
    * def histogram(buckets: Array[Double], evenBuckets: Boolean = false): Array[Long]
    */
  def demo_histogram(): Unit = {
    // Variant 1: pass a bucket count; the result holds bucket boundaries and counts.
    val tenValues = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    tenValues.histogram(5)
    // => (Array(1.1, 2.68, 4.26, 5.84, 7.42, 9.0),Array(5, 0, 0, 1, 4))

    val fifteenValues = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    fifteenValues.histogram(6)
    // => (Array(1.0, 2.5, 4.0, 5.5, 7.0, 8.5, 10.0),Array(6, 0, 1, 1, 3, 4))

    // Variant 2: pass the bucket boundaries; only the counts come back.
    val tenValuesAgain = sc.parallelize(List(1.1, 1.2, 1.3, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 9.0), 3)
    tenValuesAgain.histogram(Array(0.0, 3.0, 8.0))
    // => Array(5, 3)

    val fifteenValuesAgain = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    fifteenValuesAgain.histogram(Array(0.0, 5.0, 10.0))
    // => Array(6, 9)

    fifteenValuesAgain.histogram(Array(0.0, 5.0, 10.0, 15.0))
    // => Array(6, 8, 1)
  }

  /**
    * intersection
    */
  def demo_intersection(): Unit = {
    // Keeps the values present in both ranges (10 through 20 in the recorded result).
    val lower = sc.parallelize(1 to 20)
    val upper = sc.parallelize(10 to 30)
    val common = lower.intersection(upper)
    common.collect()
    // => Array(16, 12, 20, 13, 17, 14, 18, 10, 19, 15, 11)
  }

  /**
    * join
    */
  def demo_join(): Unit = {
    // Inner join of two word lists, both keyed by word length.
    val leftWords = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val leftByLength = leftWords.keyBy(word => word.length)
    val rightWords = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val rightByLength = rightWords.keyBy(word => word.length)
    leftByLength.join(rightByLength).collect()
    // => Array[(Int, (String, String))] = Array((6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)), (6,(salmon,salmon)), (6,(salmon,rabbit)), (6,(salmon,turkey)), (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)), (3,(dog,bee)), (3,(rat,dog)), (3,(rat,cat)), (3,(rat,gnu)), (3,(rat,bee)))
  }

  /**
    * lookup
    */
  def demo_lookup(): Unit = {
    // Fetches every value stored under key 5 (the five-letter words).
    val words = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val byLength = words.map(word => (word.length, word))
    byLength.lookup(5)
    // => Seq[String] = WrappedArray(tiger, eagle)
  }

  /**
    * map
    */
  def demo_map(): Unit = {
    // Maps each word to its length, then zips words and lengths back together.
    val words = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val lengths = words.map(word => word.length)
    val wordWithLength = words.zip(lengths)
    wordWithLength.collect()
    // => Array((dog,3), (salmon,6), (salmon,6), (rat,3), (elephant,8))
  }


  /**
    * mapPartitions
    */
  def demo_mapPartitions(): Unit = {
    // mapPartitions hands the function one iterator per partition; here every
    // partition is turned into the pairs of its neighbouring elements.
    val a = sc.parallelize(1 to 9, 3)

    // Pairs each element with its successor, last pair first (the order the
    // original prepend loop produced). A partition with fewer than two elements
    // yields nothing; the previous version called iter.next unguarded and threw
    // NoSuchElementException on an empty partition. Explicit tuples also avoid
    // the deprecated argument auto-tupling of `res.::=(pre, cur)`.
    def myfunc[T](iter: Iterator[T]): Iterator[(T, T)] = {
      val elems = iter.toList
      elems.zip(elems.drop(1)).reverse.iterator
    }

    a.mapPartitions(myfunc).collect
    // => Array((2,3), (1,2), (5,6), (4,5), (8,9), (7,8))
  }


  /**
    * mapPartitionsWithIndex
    */
  def demo_mapPartitionsWithIndex(): Unit = {
    // Prefixes every element with the index of the partition it lives in.
    val oneToTen = sc.parallelize(List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10), 3)

    // Renders "<partition index>,<element>" for each element of one partition.
    def withPartitionIndex(partId: Int, elems: Iterator[Int]): Iterator[String] =
      elems.toList.map(v => s"$partId,$v").iterator

    oneToTen.mapPartitionsWithIndex(withPartitionIndex).collect()
    // => Array(0,1, 0,2, 0,3, 1,4, 1,5, 1,6, 2,7, 2,8, 2,9, 2,10)
  }


  /**
    * mapValues
    */
  def demo_mapValues(): Unit = {
    // Wraps each value in "x...x" while leaving the key (word length) untouched.
    val words = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val byLength = words.map(word => (word.length, word))
    byLength.mapValues(word => "x" + word + "x").collect()
    // => Array((3,xdogx), (5,xtigerx), (4,xlionx), (3,xcatx), (7,xpantherx), (5,xeaglex))
  }

  /**
    * max
    */
  def demo_max(): Unit = {
    // Largest element of an Int range.
    val range = sc.parallelize(10 to 30)
    range.max()
    // => 30

    // Largest tuple under the default tuple ordering.
    val pairs = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (18, "cat")))
    pairs.max()
    // => (18,cat)
  }

  /**
    * min
    */
  def demo_min(): Unit = {
    // Smallest element of an Int range.
    val range = sc.parallelize(10 to 30)
    range.min()
    // => 10

    // Smallest tuple under the default tuple ordering.
    val pairs = sc.parallelize(List((10, "dog"), (3, "tiger"), (9, "lion"), (8, "cat")))
    pairs.min()
    // => (3,tiger)
  }

  /**
    * mean [Double], meanApprox [Double]
    */
  def demo_mean(): Unit = {
    // Arithmetic mean of a Double RDD.
    val values = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    values.mean()
    // => 5.3
  }


  /**
    * pipe
    */
  def demo_pipe(): Unit = {
    // Pipes each partition through the external command `head -n 1`; the
    // recorded result holds the first element of each of the three partitions.
    // NOTE(review): presumably requires `head` on the PATH - confirm for the target OS.
    val oneToNine = sc.parallelize(1 to 9, 3)
    val firstLines = oneToNine.pipe("head -n 1")
    firstLines.collect()
    // => Array[String] = Array(1, 4, 7)
  }

  /**
    * randomSplit
    */
  def demo_randomSplit(): Unit = {
    // Two-way split with weights 0.6 / 0.4 and a fixed seed.
    val oneToTen = sc.parallelize(1 to 10)
    val twoWay = oneToTen.randomSplit(Array(0.6, 0.4), seed = 11L)
    val (training, test) = (twoWay(0), twoWay(1))
    training.collect()
    // => Array(1, 4, 5, 6, 8, 10)
    test.collect()
    // => Array(2, 3, 7, 9)

    // Three-way split without a seed: results vary between runs; the values
    // below are one recorded sample.
    val oneToTenAgain = sc.parallelize(1 to 10)
    val threeWay = oneToTenAgain.randomSplit(Array(0.1, 0.3, 0.6))
    val (small, medium, large) = (threeWay(0), threeWay(1), threeWay(2))
    small.collect()
    // => Array(4, 10)
    medium.collect()
    // => Array(1, 3, 5, 8)
    large.collect()
    // => Array(2, 6, 7, 9)
  }

  /**
    * reduce
    */
  def demo_reduce(): Unit = {
    // Sums 1..100 with reduce.
    val oneToHundred = sc.parallelize(1 to 100, 3)
    oneToHundred.reduce((l, r) => l + r)
    // => 5050
  }

  /**
    * reduceByKey
    * def reduceByKey(func: (V, V) => V): RDD[(K, V)]
    * def reduceByKey(func: (V, V) => V, numPartitions: Int): RDD[(K, V)]
    * def reduceByKey(partitioner: Partitioner, func: (V, V) => V): RDD[(K, V)]
    * def reduceByKeyLocally(func: (V, V) => V): Map[K, V]
    * def reduceByKeyToDriver(func: (V, V) => V): Map[K, V]
    */
  def demo_reduceByKey(): Unit = {
    // Concatenates, per word length, all words of that length.
    val threeLetterWords = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val keyedThree = threeLetterWords.map(word => (word.length, word))
    keyedThree.reduceByKey((l, r) => l + r).collect()
    // => Array((3,dogcatowlgnuant))

    val mixedWords = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val keyedMixed = mixedWords.map(word => (word.length, word))
    keyedMixed.reduceByKey((l, r) => l + r).collect()
    // => Array((4,lion), (3,dogcat), (7,panther), (5,tigereagle))
  }

  /**
    * repartition
    */
  def demo_repartition(): Unit = {
    // Shows the partition count before and after repartition(5).
    val threeParts = sc.parallelize(List(1, 2, 10, 4, 5, 2, 1, 1, 1), 3)
    threeParts.partitions.length
    // => 3
    val fiveParts = threeParts.repartition(5)
    fiveParts.partitions.length
    // => 5
  }

  /**
    * rightOuterJoin
    */
  def demo_rightOuterJoin(): Unit = {
    // Right outer join of two word lists keyed by length; right-side keys with
    // no left match appear with None in the recorded result.
    val leftWords = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
    val leftByLength = leftWords.keyBy(word => word.length)
    val rightWords = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
    val rightByLength = rightWords.keyBy(word => word.length)
    leftByLength.rightOuterJoin(rightByLength).collect()
    // => Array[(Int, (Option[String], String))] = Array((6,(Some(salmon),salmon)), (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (6,(Some(salmon),salmon)), (6,(Some(salmon),rabbit)), (6,(Some(salmon),turkey)), (3,(Some(dog),dog)), (3,(Some(dog),cat)), (3,(Some(dog),gnu)), (3,(Some(dog),bee)), (3,(Some(rat),dog)), (3,(Some(rat),cat)), (3,(Some(rat),gnu)), (3,(Some(rat),bee)), (4,(None,wolf)), (4,(None,bear)))
  }

  /**
    * sample
    */
  def demo_sample(): Unit = {
    // Counts the size of samples drawn with different replacement / fraction / seed settings.
    val population = sc.parallelize(1 to 10000, 3)

    population.sample(withReplacement = false, fraction = 0.1, seed = 0).count()
    // => 960

    population.sample(withReplacement = true, fraction = 0.3, seed = 0).count()
    // => 2888

    population.sample(withReplacement = true, fraction = 0.3, seed = 13).count()
    // => 2985
  }

  /**
    * sampleByKey
    */
  def demo_sampleByKey(): Unit = {
    // Samples a pair RDD with a separate fraction per key (0.4 for key 7, 0.6 for key 6).
    val items = sc.parallelize(
      List((7, "cat"), (6, "mouse"), (7, "cup"), (6, "book"), (7, "tv"), (6, "screen"), (7, "heater")))
    val fractionByKey = Map(7 -> 0.4, 6 -> 0.6)
    items.sampleByKey(withReplacement = false, fractions = fractionByKey, seed = 42).collect()
    // => Array((7,cat), (6,mouse), (6,book), (6,screen), (7,heater))
  }

  /**
    * saveAsHadoopFile [Pair],
    * saveAsHadoopDataset [Pair],
    * saveAsNewAPIHadoopFile [Pair]
    *
    */
  def demo_saveAsHadoopFile(): Unit = {
    // Writes a small pair RDD as text under /tmp/listRDD. The values mix String
    // and Int, so the element type is (String, Any).
    // The saveAsHadoopFile variant was left disabled by the original author:
    //   listRDD.saveAsHadoopFile("/tmp/listRDD.txt")
    val person: List[(String, Any)] = List(("name", "zhangsan"), ("age", 20), ("address", "wuhan"))
    val personRDD = sc.parallelize(person)
    personRDD.saveAsTextFile("/tmp/listRDD")
  }

  /**
    * saveAsObjectFile
    */

  def demo_saveAsObjectFile(): Unit = {
    // Round trip: write the RDD to "objFile", then read it back as Ints.
    val outgoing = sc.parallelize(1 to 10, 3)
    outgoing.saveAsObjectFile("objFile")
    val restored = sc.objectFile[Int]("objFile")
    restored.collect()
    // => Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }


  /**
    * saveAsSequenceFile
    */
  def demo_saveAsSequenceFile(): Unit = {
    // Writes a two-partition pair RDD as a sequence file into "hd_seq_file".
    val animals = sc.parallelize(Array(("owl", 3), ("gnu", 4), ("dog", 1), ("cat", 2), ("ant", 5)), 2)
    animals.saveAsSequenceFile("hd_seq_file")
    // Log line and directory listing recorded by the original author:
    //   14/04/19 05:45:43 INFO FileOutputCommitter: Saved output of task 'attempt_201404190545_0000_m_000001_191' to file:/home/cloudera/hd_seq_file
    //
    //   [cloudera@localhost ~]$ ll ~/hd_seq_file
    //   total 8
    //   -rwxr-xr-x 1 cloudera cloudera 117 Apr 19 05:45 part-00000
    //   -rwxr-xr-x 1 cloudera cloudera 133 Apr 19 05:45 part-00001
    //   -rwxr-xr-x 1 cloudera cloudera   0 Apr 19 05:45 _SUCCESS
  }

  /**
    * sortBy
    */
  def demo_sortBy(): Unit = {
    // Sort plain Ints in both directions.
    val numbers = sc.parallelize(Array(5, 7, 1, 3, 2, 1))
    numbers.sortBy(n => n, ascending = true).collect()
    // => Array(1, 1, 2, 3, 5, 7)

    numbers.sortBy(n => n, ascending = false).collect()
    // => Array(7, 5, 3, 2, 1, 1)

    // Sort pairs by their first, then by their second component.
    val pairs = sc.parallelize(Array(("H", 10), ("A", 26), ("Z", 1), ("L", 5)))
    pairs.sortBy(p => p._1, ascending = true).collect()
    // => Array((A,26), (H,10), (L,5), (Z,1))

    pairs.sortBy(p => p._2, ascending = true).collect()
    // => Array((Z,1), (L,5), (H,10), (A,26))
  }

  /**
    * sortByKey
    */
  def demo_sortByKey(): Unit = {
    // Part 1: sort (word, position) pairs by their String key in both directions.
    val a = sc.parallelize(List("dog", "cat", "owl", "gnu", "ant"), 2)
    val b = sc.parallelize(1 to a.count.toInt, 2)
    val c = a.zip(b)
    c.sortByKey(true).collect
    // => Array((ant,5), (cat,2), (dog,1), (gnu,4), (owl,3))
    c.sortByKey(false).collect
    // => Array((owl,3), (gnu,4), (dog,1), (cat,2), (ant,5))

    // Part 2: sample five (Int, Int) pairs from the numbers crossed with
    // themselves and sort them by key, descending.
    val aa = sc.parallelize(1 to 100, 5)
    // Was `a.cartesian(aa)`, which paired the String RDD from part 1 with the
    // numbers and contradicted the recorded Array[(Int, Int)] result.
    val bb = aa.cartesian(aa)
    val cc = sc.parallelize(bb.takeSample(true, 5, 13), 2)
    val dd = cc.sortByKey(false)
    // Collect so the sort actually runs, like every other example here.
    dd.collect
    // => Array[(Int, Int)] = Array((96,9), (84,76), (59,59), (53,65), (52,4))
  }


  /**
    * subtract
    */
  def demo_subtract(): Unit = {
    // Removes from 1..9 every value that also occurs in 1..3.
    val oneToNine = sc.parallelize(1 to 9, 3)
    val oneToThree = sc.parallelize(1 to 3, 3)
    val remaining = oneToNine.subtract(oneToThree)
    remaining.collect()
    // => Array(6, 9, 4, 7, 5, 8)
  }

  /**
    * subtractByKey
    */
  def demo_subtractByKey(): Unit = {
    // Drops from the left RDD every pair whose key (word length) occurs on the right.
    val leftWords = sc.parallelize(List("dog", "tiger", "lion", "cat", "spider", "eagle"), 2)
    val leftByLength = leftWords.keyBy(word => word.length)
    val rightWords = sc.parallelize(List("ant", "falcon", "squid"), 2)
    val rightByLength = rightWords.keyBy(word => word.length)
    leftByLength.subtractByKey(rightByLength).collect()
    // => Array((4,lion))
  }

  /**
    * sum
    */
  def demo_sum(): Unit = {
    // Adds up a Double RDD; the recorded result shows floating-point rounding.
    val values = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    values.sum()
    // => 101.39999999999999
  }

  /**
    * take
    */
  def demo_take(): Unit = {
    // First two elements of a small RDD.
    val words = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    words.take(2)
    // => Array(dog, cat)

    // First ten elements of an RDD spread over 5000 partitions.
    val manyPartitions = sc.parallelize(1 to 10000, 5000)
    manyPartitions.take(10)
    // => Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)
  }


  /**
    * takeOrdered
    */
  def demo_takeOrdered(): Unit = {
    // The two smallest words under the default String ordering.
    val words = sc.parallelize(List("dog", "cat", "ape", "salmon", "gnu"), 2)
    words.takeOrdered(2)
    // => Array(ape, cat)
  }

  /**
    * takeSample
    */
  def demo_takeSample(): Unit = {
    // Draws 100 elements with replacement from 1..1000 using seed 1.
    val population = sc.parallelize(1 to 1000, 3)
    population.takeSample(withReplacement = true, num = 100, seed = 1)
  }


  /**
    * toJavaRDD
    */
  def demo_toJavaRDD(): Unit = {
    // Converts a Scala RDD into its JavaRDD counterpart.
    val animals = sc.parallelize(List("Gnu", "Cat", "Rat", "Dog"), 2)
    animals.toJavaRDD()
    // => org.apache.spark.api.java.JavaRDD[String] = ParallelCollectionRDD[6] at parallelize at <console>:12
  }


  /**
    * toLocalIterator
    */
  def demo_toLocalIterator(): Unit = {
    // Walks the RDD through a driver-side iterator, one element per next() call.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    val localIter = numbers.toLocalIterator
    localIter.next()
    // => 1
    localIter.next()
    // => 2
  }

  /**
    * top
    */
  def demo_top(): Unit = {
    // The two largest elements, largest first.
    val numbers = sc.parallelize(Array(6, 9, 4, 7, 5, 8), 2)
    numbers.top(2)
    // => Array(9, 8)
  }


  /**
    * treeAggregate
    */
  def demo_treeAggregate(): Unit = {
    // Same max-per-partition / sum-across-partitions example as demoAggregate,
    // run through treeAggregate. Note the different recorded result for zero value 5.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)

    // Labels each Int with the index of the partition it was read from.
    def labelInts(partId: Int, elems: Iterator[Int]): Iterator[String] =
      elems.toList.map(v => s"[partID:$partId, val: $v]").iterator

    numbers.mapPartitionsWithIndex(labelInts).collect()
    // => Array([partID:0, val: 1], [partID:0, val: 2], [partID:0, val: 3], [partID:1, val: 4], [partID:1, val: 5], [partID:1, val: 6])

    numbers.treeAggregate(0)((acc, v) => math.max(acc, v), (l, r) => l + r)
    // => 9

    numbers.treeAggregate(5)((acc, v) => math.max(acc, v), (l, r) => l + r)
    // => 11
  }

  /**
    * treeReduce
    */
  def demo_treeReduce(): Unit = {
    // Sums the elements with treeReduce.
    val numbers = sc.parallelize(List(1, 2, 3, 4, 5, 6), 2)
    numbers.treeReduce((l, r) => l + r)
    // => 21
  }

  /**
    * union, ++
    */
  def demo_union(): Unit = {
    // Appends one RDD to another with the ++ operator.
    val first = sc.parallelize(1 to 3, 1)
    val second = sc.parallelize(5 to 7, 1)
    val combined = first ++ second
    combined.collect()
    // => Array(1, 2, 3, 5, 6, 7)
  }

  /**
    * unpersist
    */
  def demo_unpersist(): Unit = {
    // Builds a union RDD, runs an action on it, then calls unpersist in blocking mode.
    // NOTE(review): the RDD is never persisted/cached in this method - confirm
    // whether a persist() call was intended before the collect.
    val tenParts = sc.parallelize(1 to 10, 10)
    val doubled = tenParts ++ tenParts
    doubled.collect()
    doubled.unpersist(blocking = true)
    // Log lines recorded by the original author:
    //   14/04/19 03:04:57 INFO UnionRDD: Removing RDD 22 from persistence list
    //   14/04/19 03:04:57 INFO BlockManager: Removing RDD 22
  }


  /**
    * values
    */
  def demo_values(): Unit = {
    // Extracts only the values of a (length, word) pair RDD.
    val words = sc.parallelize(List("dog", "tiger", "lion", "cat", "panther", "eagle"), 2)
    val byLength = words.map(word => (word.length, word))
    byLength.values.collect()
    // => Array(dog, tiger, lion, cat, panther, eagle)
  }

  /**
    * variance [Double], sampleVariance [Double]
    */
  def demo_variance(): Unit = {
    // Variance of a 15-element Double RDD.
    val fifteenValues = sc.parallelize(List(9.1, 1.0, 1.2, 2.1, 1.3, 5.0, 2.0, 2.1, 7.4, 7.5, 7.6, 8.8, 10.0, 8.9, 5.5), 3)
    fifteenValues.variance()
    // => 10.605333333333332

    // variance versus sampleVariance on the same nine values.
    val nineValues = sc.parallelize(List(1.0, 2.0, 3.0, 5.0, 20.0, 19.02, 19.29, 11.09, 21.0), 2)
    nineValues.variance()
    // => 66.04584444444443

    nineValues.sampleVariance()
    // => 74.30157499999999
  }

  /**
    * zip
    */

  def demo_zip(): Unit = {
    // Two-way zip: element i of the left RDD is paired with element i of the right RDD.
    val a = sc.parallelize(1 to 100, 3)
    val b = sc.parallelize(101 to 200, 3)
    a.zip(b).collect
    // => Array((1,101), (2,102), (3,103), (4,104), ...

    // Three-way zip: zip twice, then flatten the nested pair into a triple.
    // The second example now uses its own aa/bb inputs; previously they were
    // declared but unused and the zip silently went through a/b instead.
    val aa = sc.parallelize(1 to 100, 3)
    val bb = sc.parallelize(101 to 200, 3)
    val cc = sc.parallelize(201 to 300, 3)
    aa.zip(bb).zip(cc).map { case ((first, second), third) => (first, second, third) }.collect
    // => Array((1,101,201), (2,102,202), (3,103,203),...
  }


  /**
    * zipPartitions
    */

  def demo_zipParititions(): Unit = {
    // Combines three RDDs partition by partition through a user-supplied function.
    val units = sc.parallelize(0 to 9, 3)
    val tens = sc.parallelize(10 to 19, 3)
    val hundreds = sc.parallelize(100 to 109, 3)

    // Joins the i-th elements of the three partition iterators as "x y z",
    // stopping when any iterator runs out. Each new line is prepended, so a
    // partition's lines come out in reverse order.
    def joinTriples(first: Iterator[Int], second: Iterator[Int], third: Iterator[Int]): Iterator[String] = {
      val triples = first.zip(second).zip(third)
      val lines = triples.foldLeft(List.empty[String]) {
        case (acc, ((x, y), z)) => s"$x $y $z" :: acc
      }
      lines.iterator
    }

    units.zipPartitions(tens, hundreds)(joinTriples).collect()
    // => Array(2 12 102, 1 11 101, 0 10 100, 5 15 105, 4 14 104, 3 13 103, 9 19 109, 8 18 108, 7 17 107, 6 16 106)
  }

  /**
    * zipWithIndex
    */

  def demo_zipWithIndex(): Unit = {
    // Pairs every element with its position in the RDD.
    val z = sc.parallelize(Array("A", "B", "C", "D"))
    val r = z.zipWithIndex
    // Collect the result so this example actually produces the documented
    // array, consistent with the second example below.
    r.collect
    // => Array((A,0), (B,1), (C,2), (D,3))

    // Same operation on an RDD spread over five partitions.
    val z2 = sc.parallelize(100 to 120, 5)
    val r2 = z2.zipWithIndex
    r2.collect
    // => Array((100,0), (101,1), (102,2), (103,3), (104,4), (105,5), (106,6), (107,7), (108,8), (109,9), (110,10), (111,11), (112,12), (113,13), (114,14), (115,15), (116,16), (117,17), (118,18), (119,19), (120,20))
  }


  /**
    * zipWithUniqueId
    */

  def demo_zipWithUniqueId(): Unit = {
    // Pairs every element with a unique Long id; per the recorded result the
    // ids are unique but not consecutive.
    val numbers = sc.parallelize(100 to 120, 5)
    val withIds = numbers.zipWithUniqueId()
    withIds.collect()
    // => Array((100,0), (101,5), (102,10), (103,15), (104,1), (105,6), (106,11), (107,16), (108,2), (109,7), (110,12), (111,17), (112,3), (113,8), (114,13), (115,18), (116,4), (117,9), (118,14), (119,19), (120,24))
  }
}
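/*
 * Residual web-page text captured with the listing (likes, favourites,
 * comments and payment widgets of the hosting blog). It is not part of the
 * program and is kept verbatim inside this comment so the file still parses.
 *
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值
 */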