20191203SparkRDD

package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}


// Tumbling window
object TumblingWindowTest {

  // main
  def main(args: Array[String]): Unit = {


    val conf = new SparkConf().setMaster("local[*]").setAppName("Tumbling Window Test")
    val ssc = new StreamingContext(conf,Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")
    val ds = ssc.socketTextStream("SparkOnStandalone",8888)

    ds
      .flatMap(_.split(" "))
      .map((_,1))
      .window(Seconds(4))
      .print()

    ssc.start()
    ssc.awaitTermination()
  }

}

// window(Seconds(5))
/**-------------------------------------------
Time: 1575351312000 ms
-------------------------------------------
(ddd,1)

-------------------------------------------
Time: 1575351313000 ms
-------------------------------------------
(ddd,1)

-------------------------------------------
Time: 1575351314000 ms
-------------------------------------------
(ddd,1)

-------------------------------------------
Time: 1575351315000 ms
-------------------------------------------
(ddd,1)

-------------------------------------------
Time: 1575351316000 ms
-------------------------------------------
(ddd,1)
  */

// window(Seconds(4))
/**
-------------------------------------------
Time: 1575351458000 ms
-------------------------------------------
(mmm,1)

-------------------------------------------
Time: 1575351459000 ms
-------------------------------------------
(mmm,1)

-------------------------------------------
Time: 1575351460000 ms
-------------------------------------------
(mmm,1)

-------------------------------------------
Time: 1575351461000 ms
-------------------------------------------
(mmm,1)
*/
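window() above only re-emits the raw (word, 1) pairs of every batch that falls inside the window; nothing is summed. Below is a minimal sketch of a word count over the same tumbling window, assuming the same socket source SparkOnStandalone:8888 and using reduceByKeyAndWindow with equal window and slide lengths (which is what makes the window tumbling):

```scala
package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Tumbling-window word count: window length == slide length
object TumblingWindowWordCountSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Tumbling Window WordCount Sketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")

    ssc.socketTextStream("SparkOnStandalone", 8888)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      // window length 4s, slide 4s -> non-overlapping (tumbling) windows
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(4), Seconds(4))
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```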
package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sliding window
// ok
object SlidingWindowTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("Sliding Window Test")
    val ssc = new StreamingContext(conf,Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")
    val ds = ssc.socketTextStream("SparkOnStandalone",8888)

    ds
        .flatMap(_.split(" "))
        .map((_,1))
        .window(Seconds(5),Seconds(1))
        .print()



    ssc.start()
    ssc.awaitTermination()

  }

}
// .window(Seconds(5),Seconds(2))
/**Time: 1575352459000 ms
-------------------------------------------
(bbbb,1)

-------------------------------------------
Time: 1575352461000 ms
-------------------------------------------
(bbbb,1)*/


// .window(Seconds(5),Seconds(1))
/**
Time: 1575352595000 ms
-------------------------------------------
(mby,1)

-------------------------------------------
Time: 1575352596000 ms
-------------------------------------------
(mby,1)

-------------------------------------------
Time: 1575352597000 ms
-------------------------------------------
(mby,1)

-------------------------------------------
Time: 1575352598000 ms
-------------------------------------------
(mby,1)

-------------------------------------------
Time: 1575352599000 ms
-------------------------------------------
(mby,1)
*/
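For a window that slides every batch, reduceByKeyAndWindow also has an incremental variant that takes an inverse reduce function and only processes the data entering and leaving the window; it requires a checkpoint directory. A sketch under the same assumed source; the checkpoint path is just an illustrative local directory:

```scala
package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sliding-window word count maintained incrementally
object SlidingWindowWordCountSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Sliding Window WordCount Sketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")
    // required by the inverse-function variant of reduceByKeyAndWindow (hypothetical local path)
    ssc.checkpoint("file:///tmp/sliding-window-checkpoint")

    ssc.socketTextStream("SparkOnStandalone", 8888)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      // add counts entering the window, subtract counts leaving it
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, (a: Int, b: Int) => a - b, Seconds(5), Seconds(1))
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```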

package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer


// Hot-selling leaderboard example (top product per category)
object HotBoardTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("Hot Board Test")
    val ssc = new StreamingContext(conf,Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")
    val ds = ssc.socketTextStream("SparkOnStandalone",8888)

    // ok
    /*ds
      .map(line => {
         val arr = line.split(" ")
          val productName = arr(2)
          val categoryId = arr(4)
          ((productName,categoryId),1)
        })
      .print()
    /**((羽绒服,B),1)*/*/

    /*// ok
    ds
        .map(line => {
          val arr = line.split(" ")
          val productName = arr(2)
          val categoryId = arr(4)
          ((productName,categoryId),1)
        })
        .window(Seconds(5))
        .print()
   /** -------------------------------------------
    ((羽绒服,B),1)

    -------------------------------------------
    Time: 1575354081000 ms
      -------------------------------------------
    ((羽绒服,B),1)

    -------------------------------------------
    Time: 1575354082000 ms
      -------------------------------------------
    ((羽绒服,B),1)

    -------------------------------------------
    Time: 1575354083000 ms
      -------------------------------------------
    ((羽绒服,B),1)

    -------------------------------------------
    Time: 1575354084000 ms
      -------------------------------------------
    ((羽绒服,B),1)

    -------------------------------------------
    Time: 1575354085000 ms
      ---------------*/*/

    ds
        .map(line =>{
          val arr = line.split(" ")
          val productName = arr(2)
          val categoryId = arr(4)
          ((productName,categoryId),1)
        })
        .window(Seconds(5))
        .groupByKey()
        .print()
    /**
    ((羽绒服,B),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354384000 ms
      -------------------------------------------
    ((羽绒服,B),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354385000 ms
      -------------------------------------------
    ((羽绒服,B),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354386000 ms
      -------------------------------------------
    ((羽绒服,B),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354387000 ms
      -------------------------------------------
    ((羽绒服,B),ArrayBuffer(1))*/

    /**-------------------------------------------
    ((iphone11,A),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354548000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354549000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354550000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575354551000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1))*/


    // Within one tumbling-window interval, enter the following lines back to back:
    // 101 1 iphone11 4999.0 A
    // 101 1 iphone11 4999.0 A
    // 101 1 iphone11 4999.0 A

    /**Time: 1575355044000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355044000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355045000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355045000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355046000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))*/

    // ok
    /*ds
        .map(line =>{
          val arr = line.split(" ")
          val productName = arr(2)
          val categoryId = arr(4)
          ((productName,categoryId),1)
        })
        .window(Seconds(5))
        .groupByKey()
       .map(t2 =>{
          ((t2._1._1,t2._1._2),t2._2.size)
        })
        .print()*/
   /**
    ((iphone11,A),ArrayBuffer(1))

    -------------------------------------------
    Time: 1575355305000 ms
      -------------------------------------------
    ((iphone11,A),1)

    -------------------------------------------
    Time: 1575355306000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355306000 ms
      -------------------------------------------
    ((iphone11,A),3)

    -------------------------------------------
    Time: 1575355307000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355307000 ms
      -------------------------------------------
    ((iphone11,A),3)

    -------------------------------------------
    Time: 1575355308000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355308000 ms
      -------------------------------------------
    ((iphone11,A),3)

    -------------------------------------------
    Time: 1575355309000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1, 1))

    -------------------------------------------
    Time: 1575355309000 ms
      -------------------------------------------
    ((iphone11,A),3)

    -------------------------------------------
    Time: 1575355310000 ms
      -------------------------------------------
    ((iphone11,A),ArrayBuffer(1, 1))

    -------------------------------------------
    Time: 1575355310000 ms
      -------------------------------------------
    ((iphone11,A),2)*/

    ds
      .map(line => {
        val arr = line.split(" ")
        val productName = arr(2)
        val categoryId = arr(4)
        ((productName, categoryId), 1)
      })
      .window(Seconds(60))
      .groupByKey()               //(iphone11,A)  [1,1]
      .map(t2 => ((t2._1._1, t2._1._2), t2._2.size)) //((iphone11, A), 2)
      //((羽绒服,B),2)
      //((mi9,A),1)
      .transform(rdd => {
      val groupByRDD = rdd.groupBy(_._1._2) //(A,[((iphone11, A), 2),((mi9,A),1)])

      val list = ListBuffer[(String, (String, Int))]()

      groupByRDD.collect().foreach(t2 => { // collect to the driver first; mutating the ListBuffer from inside an RDD action would not update the driver-side copy
        println(t2)
        val categoryId = t2._1
        val itar = t2._2
        var next = 0
        var pName = "" // (A,(iphone11,2))
        itar.foreach(n => {
          val productName = n._1._1
          val count = n._2
          if (count > next) {
            next = count
            pName = productName
          }
        })
        list.+=((categoryId, (pName, next)))
        println(list)
      })
      val newRDD = ssc.sparkContext.makeRDD(list)
      newRDD
    })
      .map(t2 => (t2._1, t2._2._1))  // keep (categoryId, productName)
      .print()


    ssc.start()
    ssc.awaitTermination()

  }

}
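The per-category top product above can also be computed without mutating a driver-side ListBuffer at all: pre-aggregate with reduceByKeyAndWindow, then keep the best seller per category with reduceByKey. A sketch assuming the same input format `orderId userId productName price categoryId` and the same socket source:

```scala
package mby00

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Top-selling product per category, computed over a 60-second window
object HotBoardSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("Hot Board Sketch")
    val ssc = new StreamingContext(conf, Seconds(1))
    ssc.sparkContext.setLogLevel("OFF")

    ssc.socketTextStream("SparkOnStandalone", 8888)
      .map(line => {
        val arr = line.split("\\s+")
        ((arr(4), arr(2)), 1)                                   // ((categoryId, productName), 1)
      })
      .reduceByKeyAndWindow((a: Int, b: Int) => a + b, Seconds(60), Seconds(1))
      .map { case ((categoryId, productName), count) => (categoryId, (productName, count)) }
      .reduceByKey((p1, p2) => if (p1._2 >= p2._2) p1 else p2)  // keep the best seller per category
      .map { case (categoryId, (productName, _)) => (categoryId, productName) }
      .print()

    ssc.start()
    ssc.awaitTermination()
  }
}
```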
package com.baizhi.mby00

import org.apache.spark.{SparkConf, SparkContext}


// Creating an RDD from a collection --- approach 1: parallelize
// ok
object CreateRDDWithCollectionParallelizeTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("create rdd with collection parallelize")
    val sc = new SparkContext(conf)

    val rdd = sc.parallelize(List("Hello Scala","Hello Spark","Hello Hello Hello"),2)


    // ok
   /* rdd
        .flatMap(_.split(" "))
        .foreach(println)*/
    /** Hello
    Scala
    Hello
    Spark
    Hello
    Hello
    Hello*/


    // ok
    /*rdd
        .flatMap(_.split(" "))
        .foreach(print)*/
   /** HelloHelloSparkScalaHelloHelloHello*/

    // ok
    /*rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .foreach(println)*/
    /**(Hello,1)
    (Hello,1)
    (Scala,1)
    (Spark,1)
    (Hello,1)
    (Hello,1)
    (Hello,1)*/

    // ok
    /*rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .foreach(print)*/
    /**(Hello,1)(Scala,1)(Hello,1)(Spark,1)(Hello,1)(Hello,1)(Hello,1)*/



    // ok
    /*rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .groupByKey()
        .foreach(println)*/
    /**(Spark,CompactBuffer(1))
    (Hello,CompactBuffer(1, 1, 1, 1, 1))
    (Scala,CompactBuffer(1))*/

    // ok
   /* rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .groupByKey()
        .foreach(print)*/
    /**(Spark,CompactBuffer(1))(Hello,CompactBuffer(1, 1, 1, 1, 1))(Scala,CompactBuffer(1))*/


    // ok
    /*rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .groupByKey()
        .map(t2 => (t2._1,t2._2.size))
        .foreach(println)*/
    /**(Spark,1)
    (Hello,5)
    (Scala,1)*/


    // ok
    /*rdd
        .flatMap(_.split(" "))
        .map((_,1))
        .groupByKey()
        .map(t2 =>{
          val word = t2._1
          val  count = t2._2.size
          (word,word,count)
        })
        .foreach(println)*/
    /**(Hello,Hello,5)
    (Spark,Spark,1)
    (Scala,Scala,1)*/

    sc.stop()
  }

}
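The groupByKey + size pattern above works, but the same word count can be written with reduceByKey, which combines values inside each partition before shuffling. A minimal sketch over the same sample collection:

```scala
package com.baizhi.mby00

import org.apache.spark.{SparkConf, SparkContext}

// Word count via reduceByKey instead of groupByKey + size
object WordCountWithReduceByKeySketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("wordcount with reduceByKey sketch")
    val sc = new SparkContext(conf)

    sc.parallelize(List("Hello Scala", "Hello Spark", "Hello Hello Hello"), 2)
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .reduceByKey(_ + _)   // combines within each partition before the shuffle
      .foreach(println)
    // expected (order not guaranteed): (Hello,5) (Scala,1) (Spark,1)

    sc.stop()
  }
}
```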
package com.baizhi.mby00

import org.apache.spark.{SparkConf, SparkContext}


// Creating an RDD from a collection --- approach 2: makeRDD
// ok
object CreateRDDWithCollectionMakeRDDTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("create RDD with collection makeRDD test")
    val ss = new SparkContext(conf)

    val rdd = ss.makeRDD(List("Hello Spark","Hello Hadoop","Hello Hello Hello"))

    rdd
      .flatMap(_.split(" "))
      .map((_,1))
      .groupByKey()
      .map(t =>{
        val word = t._1
        val count = t._2.size
        (word,word,count,count)
      })
      .foreach(println)

    /**(Spark,Spark,1,1)
    (Hello,Hello,5,5)
    (Hadoop,Hadoop,1,1)*/

    ss.stop()


  }
}

package mby00

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer


// Applies a function to every element of the source RDD and returns a new RDD
// ok
object MapTest {


  // main
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("map test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(ListBuffer("a","b","c"))

    sourceRDD
      .map(str =>(str,str))
      .foreach(println)
   /** (c,c)
    (b,b)
    (a,a)*/


    sourceRDD
        .map(str =>{
          (str,1)
        })
        .foreach(println)
    /**(a,1)
    (c,1)
    (b,1)*/
    

    sc.stop()
  }
}


First, be clear about what the parameter of the split method means:
public String[] split(String regex) splits this string around matches of the given regular expression.
As for the regular expression itself: \\s matches whitespace characters (spaces, carriage returns, line feeds, etc.),
and + means "one or more", so \\s+ matches any run of whitespace.
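A quick plain-Scala illustration of the difference (the sample string is made up for the demo): split(" ") produces empty tokens when words are separated by more than one space and does not treat tabs as separators, while split("\\s+") handles any run of whitespace:

```scala
// Comparing split(" ") with split("\\s+")
object SplitRegexDemo {
  def main(args: Array[String]): Unit = {
    val line = "Hello   Scala\tSpark"        // multiple spaces and a tab
    println(line.split(" ").toList)          // empty tokens between Hello and Scala; the tab is not a split point
    println(line.split("\\s+").toList)       // List(Hello, Scala, Spark)
  }
}
```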
package mby00

import org.apache.spark.{SparkConf, SparkContext}


// Expands each element of the source RDD into 0..n elements and returns a new RDD
// How to test: run main directly
// ok
object FlatMapTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("flatMap test")
    val sc = new SparkContext(conf)


    val sourceRDD = sc.parallelize(Vector("Hello Scala","Hello Hello Hello"))



    sourceRDD
        .flatMap(line => line.split(" "))
        .foreach(println)
    /**Hello
    Scala
    Hello
    Hello
    Hello*/


    println("LLLLLLLLLLL")
    sourceRDD
        .flatMap(_.split("\\s+"))
        .foreach(println)
    /**Hello
    Scala
    Hello
    Hello
    Hello*/


    sourceRDD
        .flatMap(line =>{
          line.split("\\s+")
        })
        .foreach(println)
    /**Hello
    Scala
    Hello
    Hello
    Hello*/

    sc.stop()

  }
}
package mby00

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

// Applies a function to each whole partition of the source RDD and returns a new RDD
// How to test: run main directly
// ok
object MapPartitionsTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.parallelize(Vector("Hello Scala","Hello Hello Hello"),2)

    sourceRDD
        .mapPartitions(iter => {
          var lb = new ListBuffer[(String,Int)]()
          while (iter.hasNext){ // iterator over the data in the current partition
            val line = iter.next()
            lb.+=((line,1))
          }
          lb.iterator
        })
        .foreach(println)
    /**(Hello Hello Hello,1)
    (Hello Scala,1)*/
    
    
    sc.stop()



  }

}
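The main reason to reach for mapPartitions instead of map is to pay a per-partition setup cost only once, e.g. a connection or a formatter. A hedged sketch with a date formatter built once per partition (the input timestamps are made up):

```scala
package mby00

import java.text.SimpleDateFormat

import org.apache.spark.{SparkConf, SparkContext}

// mapPartitions: build one helper per partition instead of one per element
object MapPartitionsSetupSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitions setup sketch")
    val sc = new SparkContext(conf)

    sc.parallelize(Seq(0L, 1000L, 2000L), 2)
      .mapPartitions(iter => {
        // created once per partition, reused for every element in it
        val fmt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
        iter.map(ts => fmt.format(new java.util.Date(ts)))
      })
      .foreach(println)

    sc.stop()
  }
}
```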
package mby00

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

// Applies a function to each partition of the source RDD together with the partition index, and returns a new RDD
// How to test: run main directly
// ok
object MapPartitionsWithIndexTest {


  // main
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("mapPartitionsWithIndex test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(Vector("Hello Scala","Hello Hello Hello"))

    println("1111111111111111111111111111111")
    sourceRDD
        .mapPartitionsWithIndex((index,iterator) =>{
          val lb = new ListBuffer[(String,Int)]()
          while (iterator.hasNext){
            val line = iterator.next()
            lb.+=((line,index)) // emit (line, partition index) for every element
          }
          lb.iterator
        })
        .foreach(println)

    println("22222222222222222")
    val sourceRDD1 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),2)
    sourceRDD1
        .mapPartitionsWithIndex((index,it) =>{
          val lb = new ListBuffer[(String,Int)]()
            while (it.hasNext){
              val line = it.next()
              lb.+=((line,index))
            }
          lb.iterator
        })
        .foreach(println)
    // numSlices: 2
    /**(Hello Scala,0)
    (Hello Hello Hello,1)
    (Hadoop,1)*/

    println("3333333333333333333333333333333333")
    val sourceRDD2 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),3)
    sourceRDD2
        .mapPartitionsWithIndex((index,it) =>{
          val lb = new ListBuffer[(String,Int)]()
          while (it.hasNext){
            val line = it.next()
            lb.+=((line,index))
          }
          lb.iterator
        })
        .foreach(println)
    /**(Hello Hello Hello,1)
    (Hadoop,2)
    (Hello Scala,0)*/

    println("444444444444444444444444444444")
    val sourceRDD4 = sc.parallelize(Vector("Hello Scala","Hello Hello Hello","Hadoop"),5)
    sourceRDD4
        .mapPartitionsWithIndex((index,it) =>{
          val lb = new ListBuffer[(String,Int)]()
          while (it.hasNext){
            val line = it.next()
            lb.+=((line,index))
          }
          lb.iterator
        })
        .foreach(println)
    /**(Hello Scala,1)
    (Hello Hello Hello,3)
    (Hadoop,4)*/



    sc.stop()

  }
}
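Another quick way to inspect which elements land in which partition is glom(), which turns every partition into an Array; a minimal sketch over the same three-element Vector:

```scala
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// glom(): collect each partition into an Array, handy for inspecting the partition layout
object GlomPartitionLayoutSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("glom partition layout sketch")
    val sc = new SparkContext(conf)

    sc.parallelize(Vector("Hello Scala", "Hello Hello Hello", "Hadoop"), 3)
      .glom()
      .collect()
      .zipWithIndex
      .foreach { case (elems, index) => println(s"partition $index -> ${elems.mkString(", ")}") }

    sc.stop()
  }
}
```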
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// Samples a fraction of the source RDD's data and returns a new RDD
// How to test: run main directly
// ok
object SampleTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("sample test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List(1,2,3,4,5,6,7))

    println("111111111111111111111111")
    // withReplacement --- whether an element may be sampled more than once
    // fraction --- the expected probability (without replacement) or expected count (with replacement) for each element
    // seed --- used internally to generate the random numbers
    sourceRDD
        .sample(false,0.5D) // no replacement; each element is selected with probability 0.5
        .foreach(println)
    /**2
    3
    6
    7
    5*/

    println("2222222222222222222222222222")
    sourceRDD
        .sample(true,1.8) // with replacement; 1.8 is the expected number of times each element is selected
        .foreach(println)
   /** 6
    4
    1
    2
    3
    3
    3
    7
    7*/



    sc.stop()
  }
}
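sample() only approximates the requested fraction, so the number of returned elements varies from run to run. When an exact number of elements is needed on the driver, takeSample (an action) can be used instead; a minimal sketch:

```scala
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// takeSample: returns an exact number of elements as a local Array (an action, not a transformation)
object TakeSampleSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("takeSample sketch")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List(1, 2, 3, 4, 5, 6, 7))

    // exactly 3 distinct elements, with a fixed seed for repeatability
    val picked = sourceRDD.takeSample(withReplacement = false, num = 3, seed = 42L)
    println(picked.mkString(", "))

    sc.stop()
  }
}
```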
package mby00

import org.apache.spark.{SparkConf, SparkContext}


// Merges the contents of the source RDD with another RDD (duplicates kept) and returns a new RDD
// How to test: run main directly
// ok
object UnionTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName(" union test")
    val sc = new SparkContext(conf)

    val sourceRRD = sc.makeRDD(List(1,2,3,4,5,6,7),1)
    val rDD = sc.makeRDD(List(5,6,7,8,9))


    println("111111111111111111111")
    sourceRRD
        .union(rDD)
        .sortBy(e => e,true,1)
        .foreach(println)
    /**1
    2
    3
    4
    5
    5
    6
    6
    7
    7
    8
    9*/


    println("2222222222222222222222")
    sourceRRD
        .union(rDD)
        .foreach(println)
    /**1
    2
    3
    4
    5
    6
    7
    5
    7
    6
    8
    9*/





    sc.stop()
  }
}
package mby00

import org.apache.spark.{SparkConf, SparkContext}


// Returns a new RDD containing the intersection of the source RDD and another RDD
// How to test: run main directly
//
object IntersectionTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("intersection test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List(1,2,3,4))
    val rDD = sc.makeRDD(List(1,2))


    println("111111111111111111111111111")
    sourceRDD
        .intersection(rDD)
        .foreach(println)
    /**2
    1*/

    sourceRDD
        .intersection(rDD)
        .sortBy(e => e,true,1)
        .foreach(println)
   /** 1
    2*/


    sc.stop()
  }
}
package mby00

import org.apache.spark.{SparkConf, SparkContext}


// Removes duplicate elements from the source RDD and returns a new RDD
// How to test: run main directly
// ok
object DistinctTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName(" distinct test")
    val sc = new SparkContext(conf)

    val sourceRdd = sc.makeRDD(List(1,2,3,4,5,6,6,7,8),2)

    println("11111111111111111111")
    sourceRdd
      .distinct()
      .foreach(println)
    /**1
    3
    7
    5
    4
    6
    8
    2*/


    println("22222222222222222222222222")
    sourceRdd
        .distinct(1)
        .sortBy(e => e,true,1)
        .foreach(println)
    /**1
    2
    3
    4
    5
    6
    7
    8*/



    sc.stop()
  }

}
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// Groups the values of a (K,V) RDD by key and returns (K, Iterable<V>)
// How to test: run main directly
object GroupByKeyTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("groupByKey test")
    val sc = new SparkContext(conf)

    
    
    println("111111111111111")
    // Regroups the source (K,V) RDD and returns (K, Iterable<V>) for each distinct key
    // If no partition count is given, the child RDD keeps the same number of partitions as the parent RDD
    // If a partition count is given, that value takes precedence
    val sourceRDD = sc.makeRDD(List(("Hello",1),("Scala",1),("Hello",1),("Spark",1)),2)
    sourceRDD
        .groupByKey(4) // wide-dependency (shuffle) operator
        .foreach(println)

    /**(Hello,CompactBuffer(1, 1))
    (Spark,CompactBuffer(1))
    (Scala,CompactBuffer(1))*/



    sc.stop()
  }
}
package mby00

import org.apache.spark.{Partitioner, SparkConf, SparkContext}

// Aggregates the values of a (K,V) RDD per key with the given function
// How to test: run main directly
object ReduceByKeyTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("reduceByKey test")
    val sc = new SparkContext(conf)



    val sourceRDD = sc.makeRDD(List(("Hello",1),("Scala",1),("Hello",1),("Spark",1)),5)


    println("1111111111111111111111")
    // Takes the source (K,V) RDD and returns a new (K,V) RDD
    // in which the values of each key are aggregated with the given function
    // Approach 1: with a custom partitioner
    val rdd = sourceRDD
        .reduceByKey(
          new Partitioner { // custom partitioning rule
            override def numPartitions: Int = 3
            override def getPartition(key: Any): Int = {
              val k = key.asInstanceOf[String]
              if(k.startsWith("H")) 0
              else if(k.startsWith("S")) 1
              else 2
            }
          },
          (v1,v2) => v1 + v2
        )
        // .foreach(println)
    /**(Spark,1)
    (Scala,1)
    (Hello,2)*/
    println(rdd.getNumPartitions) // 3 --- number of partitions


    println("2222222222222222222")
    // Approach 2: explicit partition count
    sourceRDD
        .reduceByKey(_+_,2)
        .foreach(println)
    /**(Spark,1)
    (Hello,2)
    (Scala,1)*/

    println("33333333333333333333333333")
    // Approach 3: default partitioning (inherits the parent's partition count)
    sourceRDD
        .reduceByKey(_+_)
        .foreach(println)
    /**(Spark,1)
    (Hello,2)
    (Scala,1)*/

    println("44444444444444444")
    val rdd1 = sourceRDD
        .reduceByKey(_+_)
    println(rdd1.getNumPartitions) // 5


    sc.stop()
  }


}
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// Aggregates by key --- first within each partition, then across partitions
// How to test: run main directly
//
object AggregateByKeyTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("aggregate by key test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List(("Hello",1),("Hello",1),("Spark",1),("Scala",1)),2)

    sourceRDD
        .aggregateByKey(1)(
          (zeroValue,default) => zeroValue + default, // aggregation within a partition (starts from the zero value 1)
          (p1,p2) => p1 + p2 // aggregation across partitions
        )
        .foreach(println)
    /**(Spark,2)
    (Hello,3)
    (Scala,2)*/

    val sourceRDD13 = sc.makeRDD(List(("Hello", 1), ("Spark", 1), ("Hello", 1), ("Scala", 1)), 2)
    sourceRDD13
        .aggregateByKey(1)(
          (zeroValue,default) => zeroValue + default,
          (p1,p2) => p1 + p2
        )
        .foreach(println)
    /**(Spark,2)
    (Hello,4)
    (Scala,2)*/
      

    sc.stop()
  }
}
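Note that with zeroValue = 1 the final count per key depends on how many partitions that key appears in (the zero value is applied once per partition per key), which is why Hello sums to 3 in the first run and 4 in the second. A more typical use of aggregateByKey is a per-key average, where the zero value is a (sum, count) pair; a minimal sketch with made-up scores:

```scala
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// aggregateByKey with a (sum, count) zero value to compute a per-key average
object AggregateByKeyAverageSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("aggregateByKey average sketch")
    val sc = new SparkContext(conf)

    val scores = sc.makeRDD(List(("Hello", 4), ("Hello", 2), ("Spark", 6), ("Scala", 3)), 2)

    scores
      .aggregateByKey((0, 0))(
        (acc, v) => (acc._1 + v, acc._2 + 1),      // within a partition: add the value, bump the count
        (a, b) => (a._1 + b._1, a._2 + b._2)       // across partitions: merge (sum, count) pairs
      )
      .mapValues { case (sum, count) => sum.toDouble / count }
      .foreach(println)
    // expected (order not guaranteed): (Hello,3.0) (Spark,6.0) (Scala,3.0)

    sc.stop()
  }
}
```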
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// Called on a (K,V) RDD: sorts by key and returns a new RDD
// How to test: run main directly
// ok
object SortByKeyTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("sortByKey test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.parallelize(List(("b",1),("a",1),("c",1),("a",1)),2)



    println("1111111111111111111111111111")
    sourceRDD
        .sortByKey(false,1) // sort order (descending) and the number of partitions after sorting
        .foreach(println)
   /**(c,1)
    (b,1)
    (a,1)
    (a,1)*/




    println("2222222222222222222222222222")
    sourceRDD
        .sortByKey(true,1)
        .foreach(println)
    /**(a,1)
    (a,1)
    (b,1)
    (c,1)*/




    println("33333333333333333333333")
    sourceRDD
        .sortBy(t2 => t2._1,true,1)
        .foreach(println)
    /**(a,1)
    (a,1)
    (b,1)
    (c,1)*/



    println("4444444444444444444")
    sourceRDD
        .sortBy(t2 => t2._1,false,1)
        .foreach(println)
    /**(c,1)
    (b,1)
    (a,1)
    (a,1)*/


    sc.stop()
  }
}
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// Joins a (K,V) RDD with a (K,W) RDD on key, returning (K, (V, W))
// How to test: run main directly
object JoinTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("join test")
    val sc = new SparkContext(conf)

    val sourceRDD1 = sc.parallelize(List((1,"zs"),(2,"ls"),(3,"ww"),(4,"zl")),2)
    val sourceRDD2 = sc.parallelize(List((1,10),(2,20),(3,30),(5,50)),2)


    println("111111111111111111111111")
    sourceRDD1
        .join(sourceRDD2,2) // equivalent to an inner join
        .foreach(t2 =>println(t2._1 +"\t" + t2._2._1 + "\t" + t2._2._2) )
    /**2	ls	20
    1	zs	10
    3	ww	30*/






    println("22222222222222222222")
    sourceRDD1
        .join(sourceRDD2,2) // equivalent to an inner join
        .foreach(println)
    /**(1,(zs,10))
    (3,(ww,30))
    (2,(ls,20))*/






    println("3333333333333333333333")
    sourceRDD1
        .leftOuterJoin(sourceRDD2,2) // left outer join
        .foreach(println)
    /**(4,(zl,None))
    (2,(ls,Some(20)))
    (1,(zs,Some(10)))
    (3,(ww,Some(30)))*/

    println("4444444444444444444444444444")
    sourceRDD1
        .leftOuterJoin(sourceRDD2,2)
        .foreach(t2 =>println(t2._1 + "\t" + t2._2._1 + "\t" + t2._2._2))
    /**4	zl	None
    2	ls	Some(20)
    1	zs	Some(10)
    3	ww	Some(30)*/



    println("55555555555555555555555555555555555")
    sourceRDD1
        .rightOuterJoin(sourceRDD2,2) // equivalent to a right outer join
        .foreach(println)
    /**(2,(Some(ls),20))
    (1,(Some(zs),10))
    (3,(Some(ww),30))
    (5,(None,50))*/


    println("666666666666666666666666666666")
    sourceRDD1
        .rightOuterJoin(sourceRDD2,2)
        .foreach(t2 => println(t2._1 + "\t" + t2._2._1 + "\t" + t2._2._2))
    /**1	Some(zs)	10
    3	Some(ww)	30
    5	None	50
    2	Some(ls)	20*/

    sc.stop()
  }
}
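For completeness, fullOuterJoin keeps the keys of both sides and wraps both values in Option; a minimal sketch over the same two sample RDDs:

```scala
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// fullOuterJoin: keys from both RDDs, both sides wrapped in Option
object FullOuterJoinSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("fullOuterJoin sketch")
    val sc = new SparkContext(conf)

    val names = sc.parallelize(List((1, "zs"), (2, "ls"), (3, "ww"), (4, "zl")), 2)
    val ages  = sc.parallelize(List((1, 10), (2, 20), (3, 30), (5, 50)), 2)

    names
      .fullOuterJoin(ages, 2)
      .foreach(println)
    // expected (order not guaranteed):
    // (4,(Some(zl),None)) (5,(None,Some(50))) (1,(Some(zs),Some(10))) ...

    sc.stop()
  }
}
```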
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// cogroup(otherDataset, [numPartitions]):
// co-groups two RDDs of (K,V) and (K,W)
// and returns a new RDD of (K, (Iterable<V>, Iterable<W>))
// ok

object CogroupTest {

  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName(" cogroup test")
    val sc = new SparkContext(conf)

    val sourceRDD1 = sc.parallelize(List(("b",1),("a",1),("c",1),("a",1)),2)
    val sourceRDD2 = sc.parallelize(List(("e",1),("a",1),("c",1),("a",1)),2)


    println("11111111111111111111111111111111111111")
    sourceRDD1
        .cogroup(sourceRDD2)
        .foreach(println)
    /**(e,(CompactBuffer(),CompactBuffer(1)))
    (b,(CompactBuffer(1),CompactBuffer()))
    (a,(CompactBuffer(1, 1),CompactBuffer(1, 1)))
    (c,(CompactBuffer(1),CompactBuffer(1)))*/

    println("2222222222222222222222222222222222222222222")
    sourceRDD1
        .cogroup(sourceRDD2)
        .foreach(t2 =>println(t2._1 +"\t"+ t2._2._1 + "\t" + t2._2._2))
    /**b	CompactBuffer(1)	CompactBuffer()
    e	CompactBuffer()	CompactBuffer(1)
    a	CompactBuffer(1, 1)	CompactBuffer(1, 1)
    c	CompactBuffer(1)	CompactBuffer(1)*/

    sc.stop()
  }


}
package mby00

import org.apache.spark.{SparkConf, SparkContext}


// Repartitions the source RDD to the given number of partitions (always shuffles)
// How to test: run main directly

object RePartitionTest {


  // main
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setMaster("local[*]").setAppName("rePartition test")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List("a","b","c","d"),4)

   val rDD = sourceRDD
        .repartition(5)  // parent RDD: 4 partitions ---> child RDD: 5 partitions

    println(rDD.getNumPartitions) // 5

    rDD.foreach(println)
   /** a
    b
    c
    d*/
    sc.stop()
  }


}
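The counterpart of repartition for shrinking the partition count is coalesce, which by default avoids a shuffle (repartition(n) is literally coalesce(n, shuffle = true)). A minimal sketch:

```scala
package mby00

import org.apache.spark.{SparkConf, SparkContext}

// coalesce: reduce the number of partitions, by default without a shuffle
object CoalesceSketch {

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName("coalesce sketch")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.makeRDD(List("a", "b", "c", "d"), 4)

    val narrowed = sourceRDD.coalesce(2)            // narrow dependency, no shuffle
    println(narrowed.getNumPartitions)              // 2

    val reshuffled = sourceRDD.coalesce(5, shuffle = true) // same as repartition(5)
    println(reshuffled.getNumPartitions)            // 5

    sc.stop()
  }
}
```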
package mby00

import org.apache.spark.{Partitioner, SparkConf, SparkContext}


// Repartitions with the given partitioner and sorts records by key within each partition
// ok
object RepartitionAndSortWithinPartitions {

  // main
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[*]").setAppName(" repartitionAndSortWithinPartitions")
    val sc = new SparkContext(conf)

    val sourceRDD = sc.parallelize(List(("b", 1), ("e", 2), ("a", 1), ("c", 1), ("a", 3), ("a", 2)), 2)

    sourceRDD
      .repartitionAndSortWithinPartitions(new Partitioner {
        override def numPartitions: Int = 4
        override def getPartition(key: Any): Int = {
          val k = key.asInstanceOf[String]
          if (k.startsWith("a") || k.startsWith("e")) 0
          else if (k.startsWith("b")) 1
          else if (k.startsWith("c")) 2
          else 3
        }
      })
      .foreach(println)
    /**(c,1)
    (a,1)
    (b,1)
    (a,3)
    (a,2)
    (e,2)*/

    sc.stop()
  }
}