import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
object ReduceByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    // Judging from the sample output below, data/cc.txt holds one comma-separated record per line, e.g. "2,cc,19,sing".
    val tf: RDD[String] = sc.textFile("data/cc.txt")
    // Split on whitespace, then pair every record with a count of 1.
    val fp: RDD[String] = tf.flatMap(_.split("\\s+"))
    val mp: RDD[(String, Int)] = fp.map((_, 1))
    // mp.foreach(println)
    /* (2,cc,19,sing,1)
       (3,xx,20,cat,1)
       (2,cc,19,sing,1)
       (1,wnn,18,dangce,1)
       (2,cc,19,sing,1)
       (3,xx,20,cat,1)
       (4,ny,21,cook,1) */
    // reduceByKey merges the values of identical keys with the given function (here: summation of the counts).
    val rbk: RDD[(String, Int)] = mp.reduceByKey(_ + _)
    rbk.foreach(println)
    /* (4,ny,21,cook,1)
       (3,xx,20,cat,2)
       (1,wnn,18,dangce,1)
       (2,cc,19,sing,3) */
  }
}
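
/* Both demos call Utils.SparkUtils.getSparkContext(), a project-local helper whose source is
   not part of this listing. A minimal sketch of what it might look like is given below (in the
   real project it lives in package Utils); the master URL ("local[*]") and the application name
   are assumptions, not taken from the original code. */
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtils {
  def getSparkContext(): SparkContext = {
    // Run locally with all available cores; reuse an existing SparkContext if one is active.
    val conf = new SparkConf().setMaster("local[*]").setAppName("RddDemo")
    SparkContext.getOrCreate(conf)
  }
}
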
import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
object DistinctDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4, 5, 3, 2, 4), 2)
    // val rdd1: Array[Int] = rdd.collect()
    // rdd1.foreach(println)
    /* collect() keeps duplicates, so this prints all eight elements:
       1
       2
       3
       4
       5
       3
       2
       4 */
    // val set: Set[Int] = rdd1.toSet
    // set.foreach(println)
    /* 5
       1
       2
       3
       4 */
    // val gb: RDD[(Int, Iterable[Int])] = rdd.groupBy(e => e)
    // gb.foreach(println)
    /* (1,CompactBuffer(1))
       (4,CompactBuffer(4))
       (2,CompactBuffer(2))
       (3,CompactBuffer(3))
       (5,CompactBuffer(5)) */
    // val mp: RDD[Int] = gb.map(_._1)
    // mp.foreach(println)
    // keyBy turns every element e into the pair (e, e).
    val kb: RDD[(Int, Int)] = rdd.keyBy(e => e)
    // kb.foreach(println)
    /* (5,5)
       (3,3)
       (2,2)
       (4,4)
       (1,1)
       (2,2)
       (3,3)
       (4,4) */
    // For duplicate keys, keep the first value; the surviving (k, k) pairs represent the distinct elements.
    val rbk: RDD[(Int, Int)] = kb.reduceByKey((e, _) => e)
    rbk.foreach(println)
  }
}
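
/* For reference: the keyBy + reduceByKey pipeline above hand-builds what the built-in
   RDD.distinct() transformation essentially does (map every element to a pair, reduceByKey,
   then keep only the keys). A sketch of the equivalent one-liner, reusing the same assumed
   SparkUtils helper: */
import Utils.SparkUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object DistinctBuiltinDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext()
    val rdd: RDD[Int] = sc.makeRDD(List(1, 2, 3, 4, 5, 3, 2, 4), 2)
    // Replaces keyBy + reduceByKey + map(_._1) with a single transformation.
    rdd.distinct().foreach(println)
  }
}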