10.RDD基本操作

package rddbasic

import org.apache.spark.storage.StorageLevel
import org.apache.spark.{SparkConf, SparkContext}

/**
  * Created by hanq0 on 2017/10/23.
  */
// Collection of small, independent demonstrations of the Spark RDD API:
// creating RDDs, transformations, actions, persistence and shared variables.
// Every demo uses the single SparkContext defined below and returns Unit.
object stuRdds {
  // "local" master: Spark runs inside this JVM instead of on a cluster.
  val conf: SparkConf = new SparkConf().setAppName("WordCount").setMaster("local")
  // Shared by all demos; constructed when this object is first initialized.
  val sc: SparkContext = new SparkContext(conf)

  /** Creates an RDD from an in-memory collection held by the driver program. */
  def innnerRdds(): Unit = {
    val data = Array(1, 2, 3, 4, 5)
    val distData = sc.parallelize(data)
  }

  /** Creates an RDD from an external data set; each element is one line of "data.txt". */
  def outterRdds(): Unit = {
    val distFile = sc.textFile("data.txt")
  }

  /** Basic usage: a `map` transformation followed by a `reduce` action. */
  def basicRdd(): Unit = {
    val lines = sc.textFile("data.txt")
    // Transformation: only describes the computation, nothing is read yet.
    val lineLengths = lines.map(s => s.length)
    // Action: runs the job and sums the length of every line.
    val totalLength = lineLengths.reduce((a, b) => a + b)
  }

  /**
    * Passes a function defined on another object to a transformation.
    * `MyFunctions` is not defined in this file; it is expected to be provided
    * elsewhere in the project.
    */
  def passFunctions(): Unit = {
    val lines = sc.textFile("data.txt")
    val lineLengths = lines.map(MyFunctions.mymap)
    //val totalLength = lineLengths.reduce(MyFunctions.myreduce())
  }

  /**
    * Closure pitfall: mutating a driver-side variable from inside `foreach`.
    *
    * This is intentionally the WRONG way to aggregate. Per the Spark programming
    * guide, the closure sent to the executors carries its own copy of `counter`,
    * so the variable printed here is not reliably updated. Use an accumulator
    * (see `AccumulatorsVariables`) or `reduce` instead.
    */
  def valuesDisparched(): Unit = {
    // Must stay a `var`: the demo is precisely about mutating it from a closure.
    var counter = 0
    val data = Array(1, 2, 3, 4, 5)
    val rdd = sc.parallelize(data)
    // Wrong: Don't do this!!
    rdd.foreach(x => counter += x)
    println(s"Counter value: $counter")
  }

  /** Key-value pair operations: counts how many times each line occurs in "data.txt". */
  def KeyValueactions(): Unit = {
    val lines = sc.textFile("data.txt")
    val pairs = lines.map(s => (s, 1))
    val counts = pairs.reduceByKey((a, b) => a + b)
  }

  /**
    * RDD transformations: builds the union of two RDDs.
    *
    * Both inputs are parallelized first so that `union` is the RDD
    * transformation rather than the local `Array` method of the same name.
    */
  def RddTransformations(): Unit = {
    val data0 = sc.parallelize(Array(1, 2, 3, 4, 5))
    val data1 = sc.parallelize(Array(6, 7, 8, 9, 0))
    val data2 = data0.union(data1)
    // Other transformations: sortByKey([ascending], [numTasks]),
    // sample(withReplacement, fraction, seed), repartition(numPartitions), ...
  }

  /** RDD actions: `foreach` runs a function on every element of the RDD. */
  def RddActions(): Unit = {
    val data = Array(1, 2, 3, 4, 5)
    val rdd = sc.parallelize(data)
    // The value of `x + 1` is discarded (foreach expects a Unit function), so
    // this only shows that an action triggers a job; it produces no output.
    rdd.foreach(x => x + 1)
    // Other actions: reduce(func), collect(), count(), ...
  }

  /** RDD persistence: marks an RDD to be kept after it is first computed. */
  def RddStorage(): Unit = {
    val data = sc.textFile("data.txt")
    data.persist(StorageLevel.MEMORY_ONLY) // persist() selects the storage level explicitly
    data.cache() // cache() is the shortcut for persisting with the default storage level
  }

  // Shared variables

  /**
    * Broadcast variables let the program keep a read-only value cached on each
    * machine. Prints the broadcast array's contents.
    */
  def BroadcastVariables(): Unit = {
    val broadcastVar = sc.broadcast(Array(1, 2, 3))
    // Printing the Array directly would show its JVM identity ("[I@..."),
    // so format the elements explicitly.
    print(broadcastVar.value.mkString("Array(", ", ", ")"))
  }

  /**
    * Accumulators are variables that are only "added" to through an associative
    * and commutative operation. Sums 1..4 on the executors and prints the total
    * read back on the driver.
    */
  def AccumulatorsVariables(): Unit = {
    val accum = sc.longAccumulator("My Accumulator")
    sc.parallelize(Array(1, 2, 3, 4)).foreach(x => accum.add(x))
    print(accum.value)
  }


}
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值