Caching (cache)

package com.shujia.core

import com.shujia.core.Demo10Join.Student
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo19ForeachPartitions {
  def main(args: Array[String]): Unit = {
    /**
     * mapPartitions vs foreachPartition
     * Both exist because connection objects cannot be serialized and shipped
     * to executors; operating per partition avoids the overhead of opening
     * and closing a connection for every single record.
     * How to choose?
     * To fetch data from an external system (and return results), use mapPartitions
     * To only write data out to an external system, use foreachPartition
     * (see the mapPartitions sketch after this demo)
     */

    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo19ForeachPartitions")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[Student] = sc
      .textFile("Spark/data/students.txt")
      .map(line => {
        val splits: Array[String] = line.split(",")
        val id: String = splits(0)
        val name: String = splits(1)
        val age: Int = splits(2).toInt
        val gender: String = splits(3)
        val clazz: String = splits(4)
        Student(id, name, age, gender, clazz)
      })

    // Suppose we want to save the data to MySQL
    stuRDD
      // Operates on each partition as a whole; returns nothing
      .foreachPartition(stuIter => {
        // Just print here; the save-to-MySQL logic would go in this block,
        // analogous to mapPartitions
        stuIter
          .foreach(stu => {
            println(s"${stu.id},${stu.name}")
          })
      })
  }
}
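
For contrast, here is a minimal sketch of the mapPartitions side of that rule of thumb: fetching data from an external system with one connection per partition. The external system is stubbed out with a hypothetical FakeConnection class (standing in for a JDBC or HTTP client that cannot be serialized); only the per-partition connection pattern matters here.

package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

// Hypothetical stand-in for a non-serializable client (JDBC, HTTP, ...)
class FakeConnection {
  def lookupScore(id: String): Int = id.hashCode.abs % 100
  def close(): Unit = ()
}

object DemoMapPartitionsSketch {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("DemoMapPartitionsSketch")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val idRDD: RDD[String] = sc.parallelize(Seq("1001", "1002", "1003"), 2)

    // One connection per partition, reused for every record in the partition
    val scoreRDD: RDD[(String, Int)] = idRDD.mapPartitions(iter => {
      val conn = new FakeConnection()
      // map on an iterator is lazy, so materialize the results
      // before closing the connection
      val result = iter.map(id => (id, conn.lookupScore(id))).toList
      conn.close()
      result.iterator
    })

    scoreRDD.foreach(println)
  }
}

Note that the results are materialized with toList before close() is called; without that, the lazy iterator would run the lookups only after the connection was already closed.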

Next, caching. As a baseline, here is a job with no caching at all: the same students.txt lines feed two independent computations, class counts and gender counts.

package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(4), 1))

    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)

    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(3), 1))

    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)

    // Print
    genderCntRDD.foreach(println)
  }
}

Output (class counts, then gender counts):

(理科二班,79)
(文科三班,94)
(理科四班,91)
(理科一班,78)
(文科五班,84)
(文科一班,72)
(文科四班,81)
(理科六班,92)
(理科三班,68)
(文科六班,104)
(理科五班,70)
(文科二班,87)

(男,507)
(女,493)

With no caching, each action recomputes the entire lineage, so both jobs read students.txt from disk. The next version makes this visible by printing a marker inside a map over the raw lines: when it runs, "===== Student data read =====" appears once per line per job, i.e. every line is processed twice.

package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("=====读取了Student数据=====")
        line
      })

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))

    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)

    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))

    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)

    // Print
    genderCntRDD.foreach(println)
  }
}

To avoid recomputing the shared lineage, cache the RDD that is used more than once. With cache() in place the marker prints only during the first job; the second job serves mapStuRDD from the cache. Note that cache() is lazy: it only marks the RDD, and the data is actually stored the first time an action computes it.

package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("=====读取了Student数据=====")
        line
      })

    // Cache an RDD that is used multiple times, storing it in memory
    mapStuRDD.cache()

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))

    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)

    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))

    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)

    // Print
    genderCntRDD.foreach(println)
  }
}

cache() is shorthand for persist(StorageLevel.MEMORY_ONLY). To use a different storage level, call persist() directly; MEMORY_AND_DISK_SER keeps partitions in memory in serialized form and spills whatever does not fit to disk, trading extra CPU for a smaller memory footprint.

package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("=====读取了Student数据=====")
        line
      })

    // Cache an RDD that is used multiple times; by default it is stored in memory
    //mapStuRDD.cache()

    // To use a different caching strategy, call the persist() method instead
    // (common storage levels are summarized after this demo)
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))

    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)

    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))

    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)

    // Print
    genderCntRDD.foreach(println)

  }
}
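
For reference, a short summary of the storage levels most commonly passed to persist(). These constants all live in org.apache.spark.storage.StorageLevel; the descriptions follow the Spark documentation:

import org.apache.spark.storage.StorageLevel

// MEMORY_ONLY          - deserialized objects in memory; partitions that do not
//                        fit are simply recomputed when needed (cache() uses this)
// MEMORY_ONLY_SER      - serialized bytes in memory; smaller footprint, more CPU
// MEMORY_AND_DISK      - deserialized objects in memory; partitions that do not
//                        fit are spilled to disk instead of being recomputed
// MEMORY_AND_DISK_SER  - like MEMORY_AND_DISK, but stored in serialized form
// DISK_ONLY            - partitions are stored only on disk
// MEMORY_ONLY_2, MEMORY_AND_DISK_2, ... - same as above, with each partition
//                        replicated on two cluster nodes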

Finally, release the cache with unpersist() once the RDD is no longer needed. The endless while loop at the end only keeps the application, and therefore the Spark web UI at http://localhost:4040, alive so the Storage tab can be inspected (a programmatic check with getStorageLevel follows the code).

package com.shujia.core

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")

    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("=====读取了Student数据=====")
        line
      })

    // Cache an RDD that is used multiple times; by default it is stored in memory
    //mapStuRDD.cache()

    // To use a different caching strategy, call the persist() method instead
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))

    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)

    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))

    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)

    // Print
    genderCntRDD.foreach(println)

    // Release the cached RDD once it is no longer needed
    mapStuRDD.unpersist()

    // Keep the application alive so the web UI stays reachable
    while (true) {

    }
  }
}
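
As a quick sanity check that does not require the web UI, the storage level can also be inspected programmatically. getStorageLevel is a public method on RDD; a minimal sketch, meant to replace the unpersist() call at the end of main above (mapStuRDD and the StorageLevel import are already in scope):

// Set as soon as persist(...) is called, even though the data itself
// is only materialized by the first action
println(mapStuRDD.getStorageLevel)  // e.g. StorageLevel(disk, memory, 1 replicas)

// After unpersist() the RDD reports StorageLevel.NONE again
mapStuRDD.unpersist()
println(mapStuRDD.getStorageLevel == StorageLevel.NONE)  // prints: true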