package com.shujia.core

import com.shujia.core.Demo10Join.Student
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo19ForeachPartitions {
  def main(args: Array[String]): Unit = {
    /**
     * mapPartitions vs foreachPartition
     * Both exist for the same reason: a connection cannot be serialized and shipped to executors,
     * and both avoid the overhead of opening/closing a connection for every single record.
     * How to choose?
     * To fetch data from an external system, use mapPartitions (a transformation that returns an RDD).
     * To save data to an external system, use foreachPartition (an action with no return value).
     */
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo19ForeachPartitions")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[Student] = sc
      .textFile("Spark/data/students.txt")
      .map(line => {
        val splits: Array[String] = line.split(",")
        val id: String = splits(0)
        val name: String = splits(1)
        val age: Int = splits(2).toInt
        val gender: String = splits(3)
        val clazz: String = splits(4)
        Student(id, name, age, gender, clazz)
      })

    // Suppose we want to save the data to MySQL
    stuRDD
      // Operate on each partition as a whole; no return value
      .foreachPartition(stuIter => {
        // Just print here; the save-to-MySQL logic would live in this block, much like mapPartitions
        stuIter.foreach(stu => {
          println(s"${stu.id},${stu.name}")
        })
      })
  }
}
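The demo above only prints. Below is a minimal sketch of what the actual save-to-MySQL logic could look like, with one JDBC connection per partition. The JDBC URL, credentials, and the student table are hypothetical, and the sketch assumes the MySQL driver is on the executor classpath.

import java.sql.{Connection, DriverManager, PreparedStatement}

import com.shujia.core.Demo10Join.Student
import org.apache.spark.rdd.RDD

object Demo19SaveSketch {
  def save(stuRDD: RDD[Student]): Unit = {
    stuRDD.foreachPartition(stuIter => {
      // One connection per partition, created on the executor,
      // instead of one per record (and instead of serializing a driver-side connection)
      val conn: Connection = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/spark", "root", "123456") // hypothetical URL/credentials
      val stmt: PreparedStatement =
        conn.prepareStatement("INSERT INTO student(id, name) VALUES (?, ?)") // hypothetical table
      stuIter.foreach(stu => {
        stmt.setString(1, stu.id)
        stmt.setString(2, stu.name)
        stmt.executeUpdate()
      })
      stmt.close()
      conn.close() // tear the connection down once per partition, not once per record
    })
  }
}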
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
(理科二班,79)
(文科三班,94)
(理科四班,91)
(理科一班,78)
(文科五班,84)
(文科一班,72)
(文科四班,81)
(理科六班,92)
(理科三班,68)
(文科六班,104)
(理科五班,70)
(文科二班,87)
(男,507)
(女,493)
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
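Running this version shows the marker line printed once per input record for each of the two jobs: without caching, every action rebuilds mapStuRDD from the text file. A minimal sketch of one way to quantify the recomputation, assuming Spark 2.x's longAccumulator (the accumulator name is illustrative):

val readCount = sc.longAccumulator("studentReads")
val countedStuRDD: RDD[String] = stuRDD.map(line => {
  readCount.add(1) // incremented every time the map function actually runs
  line
})
countedStuRDD.map(line => (line.split(",")(4), 1)).reduceByKey(_ + _).foreach(println)
countedStuRDD.map(line => (line.split(",")(3), 1)).reduceByKey(_ + _).foreach(println)
// Without caching, expect roughly 2x the number of input lines here,
// because each of the two jobs recomputed the map stage from scratch
println(s"map function ran ${readCount.value} times")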
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once, keeping it in memory
    mapStuRDD.cache()

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
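cache() is shorthand for persist(StorageLevel.MEMORY_ONLY), so the call above pins the deserialized records in memory and the marker line now prints only for the first job. A quick sketch of the equivalence:

import org.apache.spark.storage.StorageLevel

// Equivalent ways to request the default in-memory level:
mapStuRDD.cache()                              // delegates to persist()
// mapStuRDD.persist(StorageLevel.MEMORY_ONLY) // what cache() expands to
println(mapStuRDD.getStorageLevel)             // confirms the level in effect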
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once; by default it is kept in memory
    //mapStuRDD.cache()
    // To use a different storage level, call persist() instead
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
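MEMORY_AND_DISK_SER is only one of a family of levels. A short reference sketch of the commonly used StorageLevel constants (all real constants on org.apache.spark.storage.StorageLevel; pass any of them to persist):

import org.apache.spark.storage.StorageLevel

object StorageLevelReference {
  val levels: Seq[StorageLevel] = Seq(
    StorageLevel.MEMORY_ONLY,         // deserialized objects in memory; what does not fit is recomputed
    StorageLevel.MEMORY_ONLY_SER,     // serialized bytes in memory: less memory, more CPU
    StorageLevel.MEMORY_AND_DISK,     // deserialized in memory, spill what does not fit to disk
    StorageLevel.MEMORY_AND_DISK_SER, // serialized in memory, spill to disk (the level used above)
    StorageLevel.DISK_ONLY,           // store partitions only on disk
    StorageLevel.MEMORY_AND_DISK_2    // as MEMORY_AND_DISK, replicated on two nodes
  )
}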
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once; by default it is kept in memory
    //mapStuRDD.cache()
    // To use a different storage level, call persist() instead
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)

    // Release the cached RDD once you are done with it
    mapStuRDD.unpersist()

    // Busy-wait so the application (and its Web UI) stays alive for inspection
    while (true) {
    }
  }
}
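To confirm the release programmatically: getStorageLevel reflects the level set by persist() and resets to StorageLevel.NONE once unpersist() runs. A minimal sketch, reusing mapStuRDD from the demo above:

println(mapStuRDD.getStorageLevel == StorageLevel.MEMORY_AND_DISK_SER) // true while cached
mapStuRDD.unpersist()
println(mapStuRDD.getStorageLevel == StorageLevel.NONE)                // true once released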