一、Union: Merging Two RDDs
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo13Union {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("union")
    val sc = new SparkContext(conf)

    val rdd1: RDD[Int] = sc.parallelize(List(1, 2, 3, 4, 5, 6))
    val rdd2: RDD[Int] = sc.parallelize(List(3, 4, 5, 6, 7, 8, 9))

    println(s"rdd1:${rdd1.getNumPartitions}")
    println(s"rdd2:${rdd2.getNumPartitions}")

    // union simply concatenates the two RDDs: it does not deduplicate,
    // and the result's partition count is the sum of both inputs'
    val unionRDD: RDD[Int] = rdd1.union(rdd2)
    println(s"unionRDD:${unionRDD.getNumPartitions}")

    unionRDD.foreach(println)
  }
}
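Because union keeps duplicates (3 through 6 appear in both inputs above), a distinct() after it gives set-style union semantics. A minimal sketch, reusing rdd1 and rdd2:

    // hedged sketch: union with set semantics via distinct()
    val setUnionRDD: RDD[Int] = rdd1.union(rdd2).distinct()
    setUnionRDD.foreach(println) // each of 1..9 appears once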
二、Join
1、innerJoin
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo14Join {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("join")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val idNameRDD: RDD[(String, String)] = sc.parallelize(List(
      ("001", "小伟"),
      ("002", "张三"),
      ("003", "刘思思"),
      ("004", "王五")
    ))
    val idAgeRDD: RDD[(String, String)] = sc.parallelize(List(
      ("001", "23"),
      ("002", "24"),
      ("003", "25"),
      ("004", "23")
    ))

    // inner join: only keys present in both RDDs appear in the result
    val innerJoinRDD: RDD[(String, (String, String))] = idNameRDD.join(idAgeRDD)
    innerJoinRDD
      .map {
        case (id: String, (name: String, age: String)) =>
          (id, name, age.toInt)
      }
      .foreach(println)
    // 2、leftOuterJoin
    // left outer join: every key from the left RDD is kept;
    // the right side becomes an Option
    val leftJoinRDD: RDD[(String, (String, Option[String]))] = idNameRDD.leftOuterJoin(idAgeRDD)
    leftJoinRDD
      .map {
        case (id: String, (name: String, Some(age))) =>
          (id, name, age.toInt)
        case (id: String, (name: String, None)) =>
          (id, name, 0)
      }
      .foreach(println)
    // 3、fullOuterJoin
    // full outer join: keys from either side are kept; both sides are Options.
    // All branches must return the same type, so age is converted with toInt
    val fullJoinRDD: RDD[(String, (Option[String], Option[String]))] = idNameRDD.fullOuterJoin(idAgeRDD)
    fullJoinRDD
      .map {
        case (id: String, (Some(name), None)) =>
          (id, name, 0)
        case (id: String, (None, Some(age))) =>
          (id, "默认值", age.toInt)
        case (id: String, (Some(name), Some(age))) =>
          (id, name, age.toInt)
      }
      .foreach(println)
  }
}
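Spark also offers rightOuterJoin, the mirror image of leftOuterJoin: every key from the right RDD is kept and the left side becomes the Option. A minimal sketch against the same two RDDs:

    // hedged sketch: right outer join keeps every key from idAgeRDD
    val rightJoinRDD: RDD[(String, (Option[String], String))] = idNameRDD.rightOuterJoin(idAgeRDD)
    rightJoinRDD
      .map {
        case (id, (nameOpt, age)) =>
          (id, nameOpt.getOrElse("默认值"), age.toInt)
      }
      .foreach(println)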
三、MapValues
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo15MapValues {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("mapValues")
    val sc = new SparkContext(conf)

    val idAgeRDD: RDD[(String, Int)] = sc.parallelize(List(
      ("001", 23),
      ("002", 24),
      ("003", 25),
      ("004", 23)
    ))

    // mapValues transforms only the value and leaves the key untouched
    val rdd: RDD[(String, Int)] = idAgeRDD.mapValues(age => age + 1)
    rdd.foreach(println)
  }
}
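Unlike map, mapValues preserves the RDD's partitioner, because the keys are guaranteed not to change. A minimal sketch of the same transformation written with map, which produces the same pairs but drops the partitioner:

    // hedged sketch: same output as mapValues above, but map does not
    // preserve the partitioner since it could rewrite the keys
    val mapRDD: RDD[(String, Int)] = idAgeRDD.map {
      case (id, age) => (id, age + 1)
    }
    mapRDD.foreach(println)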
四、Sort
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo16Sort {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("sort")
    val sc = new SparkContext(conf)

    val linesRDD: RDD[String] = sc.textFile("data/score.txt")

    // parse "studentId,courseId,score" lines, dropping malformed ones
    val scoreRDD: RDD[(String, Int)] = linesRDD
      .map(line => line.split(","))
      .filter(arr => arr.length == 3)
      .map {
        case Array(id: String, _: String, sco: String) =>
          (id, sco.toInt)
      }

    // total score per student
    val sumScoreRDD: RDD[(String, Int)] = scoreRDD.reduceByKey(_ + _)

    // sort by total score, descending
    val sortByRDD: RDD[(String, Int)] = sumScoreRDD.sortBy(kv => kv._2, ascending = false)
    sortByRDD.foreach(println)
  }
}
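The same ordering can be expressed with sortByKey by first swapping key and value. A minimal sketch on sumScoreRDD:

    // hedged sketch: sort by total score via sortByKey after a swap
    val sortByKeyRDD: RDD[(Int, String)] = sumScoreRDD
      .map(_.swap)
      .sortByKey(ascending = false)
    sortByKeyRDD.foreach(println)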
五、Distinct
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo17Distinct {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("distinct")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val rdd1: RDD[Int] = sc.parallelize(List(1, 1, 12, 5, 2, 2, 52, 41, 5, 3, 32, 4, 5))

    // distinct removes duplicates; it needs a shuffle to do so
    val distinctRDD: RDD[Int] = rdd1.distinct()
    distinctRDD.foreach(println)
  }
}
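Internally, distinct boils down to a map into key-value pairs plus a reduceByKey, which is where the shuffle comes from. A minimal sketch of the same idea on rdd1:

    // hedged sketch: deduplication via reduceByKey, roughly what distinct() does
    val dedupRDD: RDD[Int] = rdd1
      .map(i => (i, null))
      .reduceByKey((a, _) => a)
      .map(_._1)
    dedupRDD.foreach(println)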
七、RDD Action Operators
package com.shujia.spark.core

import com.shujia.spark.util.HDFSUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo18Action {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setMaster("local")
    conf.setAppName("action")
    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")

    // count: number of elements in the RDD
    val count: Long = studentRDD.count()
    println(count)

    // sum: add up all ages, then compute the average age
    val sumAge: Double = studentRDD
      .map(line => line.split(",")(2).toInt)
      .sum()
    println(sumAge / count)

    // take: fetch the first 10 elements to the driver
    val top10: Array[String] = studentRDD.take(10)
    top10.foreach(println)

    // collect: pull the entire RDD to the driver; avoid on large datasets
    val array: Array[String] = studentRDD.collect()
    array.foreach(println)

    // foreach: runs on the executors, element by element
    studentRDD.foreach(println)

    // foreachPartition: one iterator per partition, useful for
    // per-partition setup such as database connections
    studentRDD.foreachPartition((iter: Iterator[String]) => {
      iter.foreach(println)
    })

    // saveAsTextFile fails if the output path exists, so delete it first
    HDFSUtil.deletePath("data/test")
    studentRDD.saveAsTextFile("data/test")

    // keep the application alive so the Spark web UI can be inspected
    while (true) {
    }
  }
}
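A couple more common actions, sketched on the same studentRDD (the age column index follows the file layout above):

    // hedged sketch: first returns one element, reduce aggregates pairwise
    val firstLine: String = studentRDD.first()
    val maxAge: Int = studentRDD
      .map(_.split(",")(2).toInt)
      .reduce((a, b) => math.max(a, b))
    println(s"first=$firstLine, maxAge=$maxAge")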
八、StudentTest: The 100 Students with the Most Uneven Scores Across Subjects
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo19Student {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("student")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val scoresRDD: RDD[String] = sc.textFile("data/score.txt")

    // parse "studentId,courseId,score" lines, dropping malformed ones
    val kvRDD: RDD[(String, String, Double)] = scoresRDD
      .map(_.split(","))
      .filter(_.length == 3)
      .map {
        case Array(sId: String, cId: String, sco: String) =>
          (sId, cId, sco.toDouble)
      }

    // group all score records by student id
    val groupByRDD: RDD[(String, Iterable[(String, String, Double)])] = kvRDD.groupBy(_._1)

    // standard deviation of each student's scores: the larger it is,
    // the more uneven the student's performance across subjects
    val stdRDD: RDD[(String, Double)] = groupByRDD.map {
      case (id: String, iter: Iterable[(String, String, Double)]) =>
        val scores: List[Double] = iter.map(_._3).toList
        val avg: Double = scores.sum / scores.length
        val std: Double = math.sqrt(scores.map(i => (i - avg) * (i - avg)).sum / scores.length)
        (id, std)
    }

    // take the 100 students with the highest deviation
    val sortByRDD: RDD[(String, Double)] = stdRDD.sortBy(_._2, ascending = false)
    val top100: Array[(String, Double)] = sortByRDD.take(100)
    top100.foreach(println)
  }
}
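sortBy followed by take sorts the entire RDD just to keep 100 rows; takeOrdered fetches the top 100 in a single action. A minimal sketch with the same result:

    // hedged sketch: top 100 by descending deviation, without a full sort
    val top100Alt: Array[(String, Double)] = stdRDD
      .takeOrdered(100)(Ordering.by[(String, Double), Double](kv => -kv._2))
    top100Alt.foreach(println)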
九、Submit: Running on a Cluster
package com.shujia.spark.core

import com.shujia.spark.util.HDFSUtil
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Submit {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("submit")
    // no setMaster here: the master is supplied by spark-submit on the cluster
    val sc = new SparkContext(conf)

    // HDFS path, since the job runs on the cluster
    val studentRDD: RDD[String] = sc.textFile("/data/students.txt")

    // count students per class
    val kvRDD: RDD[(String, Int)] = studentRDD
      .map(line => {
        val split: Array[String] = line.split(",")
        (split(4), 1)
      })
    val classNumRDD: RDD[(String, Int)] = kvRDD.reduceByKey(_ + _)

    val resultRDD: RDD[String] = classNumRDD
      .map {
        case (clazz: String, num: Int) =>
          s"$clazz\t$num"
      }

    // delete the output path first; saveAsTextFile fails if it already exists
    HDFSUtil.deletePath("/data/clazz_num")
    resultRDD.saveAsTextFile("/data/clazz_num")
  }
}
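After packaging the project into a jar, the job is launched with spark-submit, which supplies the master. A sketch of a typical command; the jar name and the yarn master are assumptions, adjust them to your build and cluster:

    spark-submit --master yarn --deploy-mode client --class com.shujia.spark.core.Demo20Submit spark-demo-1.0.jar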
十、StudentTest: Students Who Passed Every Subject
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo21Student {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("student")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val scoreRDD: RDD[String] = sc.textFile("data/score.txt")
    val subjectRDD: RDD[String] = sc.textFile("data/subject.txt")

    // scores keyed by course id: (courseId, (studentId, score))
    val scoreKVRDD: RDD[(String, (String, Int))] = scoreRDD
      .map(_.split(","))
      .filter(_.length == 3)
      .map {
        case Array(sid: String, cid: String, sco: String) =>
          (cid, (sid, sco.toInt))
      }

    // full marks per course: (courseId, fullScore)
    val subjectKVRDD: RDD[(String, Int)] = subjectRDD
      .map(_.split(","))
      .filter(_.length == 3)
      .map {
        case Array(cid: String, _: String, sumSco: String) =>
          (cid, sumSco.toInt)
      }

    // join full marks onto every score record by course id
    val joinRDD: RDD[(String, (Int, (String, Int)))] = subjectKVRDD.join(scoreKVRDD)
    val comRDD: RDD[(String, String, Int, Int)] = joinRDD.map {
      case (cid: String, (sumSco: Int, (sid: String, sco: Int))) =>
        (sid, cid, sumSco, sco)
    }

    // passing means scoring at least 60% of the course's full marks
    val filterRDD: RDD[(String, String, Int, Int)] = comRDD.filter {
      case (_: String, _: String, sumSco: Int, sco: Int) =>
        sco >= sumSco * 0.6
    }

    // a student who passed all 6 subjects still has 6 records left
    val groupByRDD: RDD[(String, Iterable[(String, String, Int, Int)])] = filterRDD.groupBy(_._1)
    val resultRDD: RDD[(String, Iterable[(String, String, Int, Int)])] = groupByRDD.filter(_._2.size == 6)

    // doujige = 都及格, "passed every subject"
    val doujigeRDD: RDD[(String, String, Int, Int)] = resultRDD.flatMap(kv => kv._2)
    doujigeRDD.foreach(println)
  }
}
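The 6 above is hardcoded; it can instead be derived from subject.txt, assuming one line per subject. A minimal sketch:

    // hedged sketch: derive the subject count instead of hardcoding 6
    val subjectNum: Long = subjectKVRDD.count()
    val passAllRDD: RDD[(String, Iterable[(String, String, Int, Int)])] =
      groupByRDD.filter(_._2.size == subjectNum)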
十一、Estimating PI in Code
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.util.Random

object Demo22PI {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("pi")
    conf.setMaster("local[8]")
    val sc = new SparkContext(conf)

    val list: Range = 0 until 1000000
    val length: Int = list.length
    val listRDD: RDD[Int] = sc.parallelize(list, 8)

    // throw random points into the square [-1, 1] x [-1, 1]
    val pointsRDD: RDD[(Double, Double)] = listRDD.map(i => {
      val x: Double = Random.nextDouble() * 2 - 1
      val y: Double = Random.nextDouble() * 2 - 1
      (x, y)
    })

    // keep the points that land inside the unit circle (yuan = 圆, circle)
    val yuanPointRDD: RDD[(Double, Double)] = pointsRDD.filter {
      case (x: Double, y: Double) =>
        (x * x + y * y) < 1
    }

    // circle area / square area = PI / 4, so PI ≈ 4 * inside / total
    val PI: Double = yuanPointRDD.count().toDouble / length * 4.0
    println(s"PI is :$PI")
  }
}
十二、Acc: Accumulators
package com.shujia.spark.core

import org.apache.spark.rdd.RDD
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object Demo23Acc {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("acc")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt", 2)

    // a plain var is captured into the task closure: each task increments
    // its own copy, so the driver still prints 0 afterwards
    var count = 0
    studentRDD.foreach(stu => {
      count += 1
      println(count)
    })
    println(count)

    // an accumulator's updates are sent back to and merged on the driver
    val countAcc: LongAccumulator = sc.longAccumulator
    studentRDD.foreach(stu => {
      countAcc.add(1)
    })
    println(countAcc.value)

    // keep the application alive so the Spark web UI can be inspected
    while (true) {
    }
  }
}
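Two notes on accumulators: giving one a name makes it visible in the web UI, and they are only reliable inside actions such as foreach, since retried tasks in transformations can add the same values more than once. A minimal sketch of a named accumulator:

    // hedged sketch: a named accumulator shows up in the web UI
    val lineAcc: LongAccumulator = sc.longAccumulator("lineCount")
    studentRDD.foreach(_ => lineAcc.add(1))
    println(lineAcc.value)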
十三、Bro: Broadcast Variables
package com.shujia.spark.core

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo24Bro {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf()
    conf.setAppName("bro")
    conf.setMaster("local")
    val sc = new SparkContext(conf)

    val studentRDD: RDD[String] = sc.textFile("data/students.txt")
    val scoreRDD: RDD[String] = sc.textFile("data/score.txt")

    // pull the (small) student table to the driver and key it by id
    val studentArray: Array[String] = studentRDD.collect()
    val kvStuArray: Array[(String, String)] = studentArray.map(stu => {
      val split: Array[String] = stu.split(",")
      val id: String = split(0)
      (id, stu)
    })
    val stuMap: Map[String, String] = kvStuArray.toMap

    // broadcast the map once per executor instead of once per task
    val stuMapBro: Broadcast[Map[String, String]] = sc.broadcast(stuMap)

    // map-side join: look each score's student id up in the broadcast
    // map, so no shuffle is needed
    val joinRDD: RDD[(String, String)] = scoreRDD.map(sco => {
      val split: Array[String] = sco.split(",")
      val id: String = split(0)
      val broValue: Map[String, String] = stuMapBro.value
      val stuInfo: String = broValue.getOrElse(id, "默认值")
      (stuInfo, sco)
    })
    joinRDD.foreach(println)

    // keep the application alive so the Spark web UI can be inspected
    while (true) {
    }
  }
}
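A broadcast map-side join like this only makes sense when one side is small enough to fit in driver and executor memory. Once the broadcast is no longer needed it can be released; a minimal sketch:

    // hedged sketch: free the broadcast's cached copies on the executors
    stuMapBro.unpersist()
    // stuMapBro.destroy() // removes it entirely; it cannot be used again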