RDD
distinct: global deduplication; may change the number of partitions
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

/**
 * distinct removes duplicate elements.
 * It is a transformation (lazy) operator.
 * It deduplicates globally and can change the number of partitions.
 */
object DistinctDemo {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext
    val seq = Seq(1, 2, 4, 5, 66, 66, 6, 5, 77, 77)
    // convert the local collection into an RDD with 2 partitions
    val rdd: RDD[Int] = sc.parallelize(seq, 2)
    // distinct is a transformation operator; here it writes the result into 3 partitions
    val rdd2: RDD[Int] = rdd.distinct(3)
    println(rdd.partitions.size)  // 2
    println(rdd2.partitions.size) // 3
    val arr: Array[Int] = rdd2.collect()
    println(arr.toBuffer)
    sc.stop()
  }
}
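For reference, distinct is itself built from more primitive operators: Spark implements it as a map to (element, null) pairs, a reduceByKey shuffle, and a map back. A minimal sketch of the equivalent pipeline (DistinctEquivalent is a hypothetical name; it reuses the SparkUtils helper from above):

import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

object DistinctEquivalent {
  def main(args: Array[String]): Unit = {
    val sc: SparkContext = SparkUtils.getSparkContext // same helper as the demos above
    val rdd: RDD[Int] = sc.parallelize(Seq(1, 2, 4, 5, 66, 66, 6, 5, 77, 77), 2)
    // rdd.distinct(3) behaves like:
    val deduped: RDD[Int] = rdd
      .map(x => (x, null))         // turn each element into a (key, null) pair
      .reduceByKey((x, _) => x, 3) // shuffle by key, keep one value per key, 3 output partitions
      .map(_._1)                   // drop the dummy value
    println(deduped.collect().toBuffer)
    sc.stop()
  }
}

This also explains why distinct can change the partition count: the reduceByKey shuffle accepts its own numPartitions.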
groupBy
import org.apache.spark.rdd.RDD

/**
 * groupBy groups elements by a key function and returns an RDD of [k, v] pairs.
 * On an RDD of plain elements: [w]    --> [k, iters]
 * On an RDD of pairs:          [k, v] --> [k, iters]
 */
object GroupByDemo {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.getSparkContext
    val seq: Seq[String] = Seq("a", "b", "c", "d", "e", "f")
    val rdd1: RDD[String] = sc.parallelize(seq, 2)
    // groupBy returns (key, Iterable) pairs; here the key is the element itself
    val rdd2: RDD[(String, Iterable[String])] = rdd1.groupBy(e => e)
    val rdd3: RDD[(String, List[String])] = rdd2.map(e => {
      val k = e._1
      val iters: Iterable[String] = e._2
      (k, iters.toList)
    })
    rdd3.foreach(println)
    sc.stop()
  }
}
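The key function does not have to be the identity. A small sketch (GroupByParity is a hypothetical name, again assuming the SparkUtils helper) that groups integers by parity:

import org.apache.spark.rdd.RDD

object GroupByParity {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.getSparkContext
    val rdd: RDD[Int] = sc.parallelize(Seq(1, 2, 3, 4, 5, 6), 2)
    // the key function maps each element to "even" or "odd"
    val grouped: RDD[(String, Iterable[Int])] =
      rdd.groupBy(n => if (n % 2 == 0) "even" else "odd")
    grouped.foreach(println) // e.g. (even,CompactBuffer(2, 4, 6))
    sc.stop()
  }
}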
import org.apache.spark.rdd.RDD

object Wc1 {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.getSparkContext
    val seq: Seq[String] = Seq("a", "b", "c", "d", "e", "f")
    val rdd1: RDD[String] = sc.parallelize(seq, 2)
    // each element here is already a single word, so the split is effectively a no-op
    val rdd2: RDD[String] = rdd1.flatMap(_.split("\\s+"))
    val rdd3: RDD[(String, Iterable[String])] = rdd2.groupBy(e => e)
    // the count of each word is the size of its group
    val rdd4: RDD[(String, Int)] = rdd3.map(e => {
      (e._1, e._2.size)
    })
    rdd4.foreach(println)
    sc.stop()
  }
}
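groupBy works for word count, but it shuffles every occurrence of every word across the network before counting. The usual idiom is reduceByKey, which combines counts on the map side first. A sketch under the same SparkUtils assumption (Wc2 and the sample lines are hypothetical):

import org.apache.spark.rdd.RDD

object Wc2 {
  def main(args: Array[String]): Unit = {
    val sc = SparkUtils.getSparkContext
    val lines: Seq[String] = Seq("a b c", "a b", "a")
    val rdd: RDD[String] = sc.parallelize(lines, 2)
    val counts: RDD[(String, Int)] = rdd
      .flatMap(_.split("\\s+")) // split each line into words
      .map(w => (w, 1))         // tag each word with a count of 1
      .reduceByKey(_ + _)       // sum counts per word, combining within each partition first
    counts.foreach(println)    // e.g. (a,3) (b,2) (c,1)
    sc.stop()
  }
}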