import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ListBuffer

object RDDDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("RDDDemo").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(sparkConf)
    mapOper(sc)
    filterOper(sc)
    flatMapOper(sc)
    mapPartitionsOper(sc)
    mapPartitionsWithIndexOper(sc)
    sc.stop()
  }
  /**
   * map: applies a function to every element, producing an RDD of the results.
   */
  def mapOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 1)
    val rdd1: RDD[Int] = rdd.map((a: Int) => a * 3)
    rdd1.foreach(println)
    val rdd2: RDD[String] = sc.makeRDD(Array("hadoop", "scala", "flink", "hadoop"))
    val rdd3: RDD[(String, Int)] = rdd2.map((word: String) => (word, 1))
    rdd3.foreach(println)
  }
  /**
   * filter: keeps only the elements for which the predicate returns true.
   */
  def filterOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(1 to 100)
    val rdd1: RDD[Int] = rdd.filter((a: Int) => a % 2 == 1) // keep odd numbers
    rdd1.foreach(println)
  }
  /**
   * flatMap: maps each element to zero or more elements and flattens the result.
   */
  def flatMapOper(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(Array("i am people", "dog is an animal"))
    val rdd1: RDD[String] = rdd.flatMap((line: String) => line.split(" "))
    println("============flatMap============")
    rdd1.foreach(println)
  }
  /**
   * mapPartitions: processes one whole partition at a time, receiving an
   * Iterator over the partition's elements and returning a new Iterator.
   */
  def mapPartitionsOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 4)
    val rdd1: RDD[Int] = rdd.mapPartitions((a: Iterator[Int]) => {
      val listBuffer: ListBuffer[Int] = ListBuffer()
      for (num <- a) {
        println(s"Processing one partition element: $num")
        listBuffer.append(num * 3)
      }
      listBuffer.iterator
    })
    rdd1.foreach(println)
  }
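  /**
   * The main payoff of mapPartitions over map is that per-partition setup work
   * runs once per partition rather than once per element. A minimal sketch of
   * that pattern; `connectionId` stands in for a hypothetical expensive
   * resource (e.g. a database connection) and is not a real API.
   */
  def mapPartitionsSetupOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(1 to 8, 4)
    val rdd1: RDD[String] = rdd.mapPartitions((data: Iterator[Int]) => {
      // Simulated per-partition resource: created once here, whereas inside
      // map it would be created once per element.
      val connectionId = s"conn-${scala.util.Random.nextInt(1000)}"
      data.map(num => s"$connectionId handled $num")
    })
    rdd1.foreach(println)
  }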
  /**
   * mapPartitionsWithIndex: like mapPartitions, but the function also receives
   * the index of the partition being processed.
   */
  def mapPartitionsWithIndexOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 4)
    val rdd1: RDD[Int] = rdd.mapPartitionsWithIndex((index: Int, data: Iterator[Int]) => {
      val listBuffer: ListBuffer[Int] = ListBuffer()
      for (num <- data) {
        println(s"Processing element $num of partition $index")
        listBuffer.append(num * 3)
      }
      listBuffer.iterator
    })
    rdd1.foreach(println)
  }
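  /**
   * A more idiomatic variant of the above: rather than copying into a
   * ListBuffer, transform the iterator lazily. This sketch tags every element
   * with its partition index, a common way to inspect how data is spread
   * across partitions.
   */
  def partitionTagOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0), 4)
    val rdd1: RDD[(Int, Int)] = rdd.mapPartitionsWithIndex((index: Int, data: Iterator[Int]) => {
      data.map(num => (index, num)) // lazy: no intermediate collection is built
    })
    rdd1.foreach(println)
  }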
}
object RDDDemo01 {
  def main(args: Array[String]): Unit = {
    val sparkConf: SparkConf = new SparkConf().setAppName("RDDDemo01").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(sparkConf)
    sampleOper(sc)
    unionOper(sc)
    intersectionOper(sc)
    distinctOper(sc) // was defined but never invoked
    repartitionOper(sc)
    sc.stop()
  }
  /**
   * sample and takeSample operators
   *
   * def sample(
   *     withReplacement: Boolean,
   *     fraction: Double,
   *     seed: Long = Utils.random.nextLong): RDD[T]
   *
   * withReplacement: whether to sample with replacement
   * fraction: the expected fraction of elements to sample
   * seed: the seed for the sampling random number generator
   *
   * Difference from sample: takeSample is an action that returns the final
   * sampled collection to the driver as an Array.
   */
  def sampleOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3), 4)
    val rdd2: RDD[Int] = rdd.mapPartitionsWithIndex((index: Int, data: Iterator[Int]) => {
      // Materialize the iterator first: mkString would otherwise consume it,
      // leaving an empty iterator to return.
      val elements = data.toList
      println(s"Before sample, partition $index holds: ${elements.mkString(",")}")
      elements.iterator
    })
    val rdd1: RDD[Int] = rdd.sample(false, 0.5)
    val rdd3: RDD[Int] = rdd1.mapPartitionsWithIndex((index: Int, data: Iterator[Int]) => {
      val elements = data.toList
      println(s"After sample, partition $index holds: ${elements.mkString(",")}")
      elements.iterator
    })
    // takeSample is an action: it returns the sampled elements as an Array.
    val array2: Array[Int] = rdd.takeSample(false, 10)
    println(array2.mkString("::"))
    rdd2.collect() // trigger the "before" printout
    rdd3.collect() // trigger the "after" printout
  }
  /**
   * union: set union of two RDDs, without deduplication.
   */
  def unionOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9))
    val rdd1: RDD[Int] = sc.makeRDD(Array(11, 12, 13, 14))
    val rdd2: RDD[Int] = rdd.union(rdd1)
    rdd2.foreach(println)
  }
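  /**
   * Note: union just concatenates the inputs' partitions, so for RDDs that do
   * not share a partitioner the result's partition count is the sum of the
   * inputs'. A minimal check:
   */
  def unionPartitionsOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(1 to 4, 2)
    val rdd1: RDD[Int] = sc.makeRDD(5 to 8, 3)
    println(s"Partitions after union: ${rdd.union(rdd1).getNumPartitions}") // expected 2 + 3 = 5
  }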
  /**
   * intersection: the elements present in both RDDs.
   */
  def intersectionOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 8, 9))
    val rdd1: RDD[Int] = sc.makeRDD(Array(1, 12, 3, 14))
    val rdd2: RDD[Int] = rdd.intersection(rdd1)
    rdd2.foreach(println)
  }
  /**
   * distinct: removes duplicate elements from the RDD.
   */
  def distinctOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 3, 9))
    val rdd1: RDD[Int] = rdd.distinct(3) // the argument is the number of result partitions
    rdd1.foreach(println)
  }
  /**
   * repartition: reshuffles the RDD into the given number of partitions.
   */
  def repartitionOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(Array(1, 2, 3, 4, 5, 6, 7, 3, 9), 10)
    println("Partitions before repartition: " + rdd.getNumPartitions)
    val rdd1: RDD[Int] = rdd.repartition(2)
    println("Partitions after repartition: " + rdd1.getNumPartitions)
  }
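  /**
   * In Spark, repartition(n) delegates to coalesce(n, shuffle = true). When
   * you only need to reduce the partition count, coalesce with the default
   * shuffle = false merges existing partitions and avoids the shuffle. A
   * minimal sketch:
   */
  def coalesceOper(sc: SparkContext): Unit = {
    val rdd: RDD[Int] = sc.makeRDD(1 to 10, 10)
    val rdd1: RDD[Int] = rdd.coalesce(2) // no shuffle: parent partitions are merged
    println("Partitions after coalesce: " + rdd1.getNumPartitions)
  }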
}