package com.shujia.core

import com.shujia.core.Demo10Join.Student
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo19ForeachPartitions {
  def main(args: Array[String]): Unit = {
    /**
     * mapPartitions vs foreachPartition
     * Both exist for the same reason: a connection cannot be serialized and shipped to executors,
     * and both avoid the overhead of opening/closing a connection for every single record.
     * How to choose?
     * To fetch data from an external system, use mapPartitions (a transformation that returns an RDD).
     * To save data to an external system, use foreachPartition (an action with no return value).
     */
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo19ForeachPartitions")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[Student] = sc
      .textFile("Spark/data/students.txt")
      .map(line => {
        val splits: Array[String] = line.split(",")
        val id: String = splits(0)
        val name: String = splits(1)
        val age: Int = splits(2).toInt
        val gender: String = splits(3)
        val clazz: String = splits(4)
        Student(id, name, age, gender, clazz)
      })

    // Suppose we want to save the data to MySQL
    stuRDD
      // Operate on each partition as a whole; no return value
      .foreachPartition(stuIter => {
        // Just print here; the save-to-MySQL logic would live in this block, much like mapPartitions
        stuIter.foreach(stu => {
          println(s"${stu.id},${stu.name}")
        })
      })
  }
}
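The demo above only prints. Below is a minimal sketch of what the actual save-to-MySQL logic could look like, with one JDBC connection per partition. The JDBC URL, credentials, and the student table are hypothetical, and the sketch assumes the MySQL driver is on the executor classpath.

import java.sql.{Connection, DriverManager, PreparedStatement}

import com.shujia.core.Demo10Join.Student
import org.apache.spark.rdd.RDD

object Demo19SaveSketch {
  def save(stuRDD: RDD[Student]): Unit = {
    stuRDD.foreachPartition(stuIter => {
      // One connection per partition, created on the executor,
      // instead of one per record (and instead of serializing a driver-side connection)
      val conn: Connection = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/spark", "root", "123456") // hypothetical URL/credentials
      val stmt: PreparedStatement =
        conn.prepareStatement("INSERT INTO student(id, name) VALUES (?, ?)") // hypothetical table
      stuIter.foreach(stu => {
        stmt.setString(1, stu.id)
        stmt.setString(2, stu.name)
        stmt.executeUpdate()
      })
      stmt.close()
      conn.close() // tear the connection down once per partition, not once per record
    })
  }
}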
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = stuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
(理科二班,79)
(文科三班,94)
(理科四班,91)
(理科一班,78)
(文科五班,84)
(文科一班,72)
(文科四班,81)
(理科六班,92)
(理科三班,68)
(文科六班,104)
(理科五班,70)
(文科二班,87)
(男,507)
(女,493)
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
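Running this version shows the marker line printed once per input record for each of the two jobs: without caching, every action rebuilds mapStuRDD from the text file. A minimal sketch of one way to quantify the recomputation, assuming Spark 2.x's longAccumulator (the accumulator name is illustrative):

val readCount = sc.longAccumulator("studentReads")
val countedStuRDD: RDD[String] = stuRDD.map(line => {
  readCount.add(1) // incremented every time the map function actually runs
  line
})
countedStuRDD.map(line => (line.split(",")(4), 1)).reduceByKey(_ + _).foreach(println)
countedStuRDD.map(line => (line.split(",")(3), 1)).reduceByKey(_ + _).foreach(println)
// Without caching, expect roughly 2x the number of input lines here,
// because each of the two jobs recomputed the map stage from scratch
println(s"map function ran ${readCount.value} times")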
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once, keeping it in memory
    mapStuRDD.cache()

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
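cache() is shorthand for persist(StorageLevel.MEMORY_ONLY), so the call above pins the deserialized records in memory and the marker line now prints only for the first job. A quick sketch of the equivalence:

import org.apache.spark.storage.StorageLevel

// Equivalent ways to request the default in-memory level:
mapStuRDD.cache()                              // delegates to persist()
// mapStuRDD.persist(StorageLevel.MEMORY_ONLY) // what cache() expands to
println(mapStuRDD.getStorageLevel)             // confirms the level in effect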
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once; by default it is kept in memory
    //mapStuRDD.cache()
    // To use a different storage level, call persist() instead
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)
  }
}
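MEMORY_AND_DISK_SER is only one of a family of levels. A short reference sketch of the commonly used StorageLevel constants (all real constants on org.apache.spark.storage.StorageLevel; pass any of them to persist):

import org.apache.spark.storage.StorageLevel

object StorageLevelReference {
  val levels: Seq[StorageLevel] = Seq(
    StorageLevel.MEMORY_ONLY,         // deserialized objects in memory; what does not fit is recomputed
    StorageLevel.MEMORY_ONLY_SER,     // serialized bytes in memory: less memory, more CPU
    StorageLevel.MEMORY_AND_DISK,     // deserialized in memory, spill what does not fit to disk
    StorageLevel.MEMORY_AND_DISK_SER, // serialized in memory, spill to disk (the level used above)
    StorageLevel.DISK_ONLY,           // store partitions only on disk
    StorageLevel.MEMORY_AND_DISK_2    // as MEMORY_AND_DISK, replicated on two nodes
  )
}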
package com.shujia.core

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

object Demo20Cache {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf()
    conf.setAppName("Demo20Cache")
    conf.setMaster("local")
    val sc: SparkContext = new SparkContext(conf)

    val stuRDD: RDD[String] = sc
      .textFile("Spark/data/students.txt")

    val mapStuRDD: RDD[String] = stuRDD
      .map(line => {
        println("===== read Student data =====")
        line
      })

    // Cache the RDD that is used more than once; by default it is kept in memory
    //mapStuRDD.cache()
    // To use a different storage level, call persist() instead
    //mapStuRDD.persist(StorageLevel.MEMORY_ONLY)
    mapStuRDD.persist(StorageLevel.MEMORY_AND_DISK_SER)

    // Count students per class
    val clazzRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(4), 1))
    val clazzCntRDD: RDD[(String, Int)] = clazzRDD
      .reduceByKey(_ + _)
    // Print
    clazzCntRDD.foreach(println)

    // Count students per gender
    val genderRDD: RDD[(String, Int)] = mapStuRDD
      .map(line => (line.split(",")(3), 1))
    val genderCntRDD: RDD[(String, Int)] = genderRDD
      .reduceByKey(_ + _)
    // Print
    genderCntRDD.foreach(println)

    // Release the cached RDD once you are done with it
    mapStuRDD.unpersist()

    // Busy-wait so the application (and its Web UI) stays alive for inspection
    while (true) {
    }
  }
}
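To confirm the release programmatically: getStorageLevel reflects the level set by persist() and resets to StorageLevel.NONE once unpersist() runs. A minimal sketch, reusing mapStuRDD from the demo above:

println(mapStuRDD.getStorageLevel == StorageLevel.MEMORY_AND_DISK_SER) // true while cached
mapStuRDD.unpersist()
println(mapStuRDD.getStorageLevel == StorageLevel.NONE)                // true once released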