Spark代码可读性与性能优化——示例四

31 篇文章 1 订阅
27 篇文章 3 订阅

Spark代码可读性与性能优化——示例四

1. 内容点大纲

  • 更简洁的写法
  • 易懂的写法
  • 返回类型提示
  • 本地打印提示
  • 代码换行提示
  • 占位符提示
    *注意:和前面文章内容重复的不再做提示,已直接修改

2. 原代码(来自GitHub)

import org.apache.spark.{SparkContext, SparkConf}

object Ex4_MoreOperationsOnRDDs {
  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("Ex4_MoreOperationsOnRDDs").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // put some data in an RDD
    val letters = sc.parallelize('a' to 'z', 8)

    // vowels, but NOT an RDD
    // this is important because you can't access RDDs in
    // operations on individual elements of an RDD
    val vowels = Seq('a', 'e', 'i', 'o', 'u')

    val consonants = letters.filter(c => !vowels.contains(c))
    println("There are " + consonants.count() + " consonants")

    // can use a partial function to filter and transform
    // notice the new RDD in this case has a different type
    val consonantsAsDigits = letters collect {
      case c:Char if !vowels.contains(c) => c.asDigit
    }
    consonantsAsDigits.foreach(println)

    // TODO: pay more attention to flatMap
    // flatMap: pay attention here -- the words don't always come
    // out in the right order, but the characters within a word do
    val words = sc.parallelize(Seq("Hello", "World"), 2)
    val chars = words.flatMap(w => w.iterator)
    println(chars.map(c => c.toString).reduce((s1, s2) => s1 + " " + s2))

    // groupBy
    val numbers = sc.parallelize(1 to 10, 4)
    val modThreeGroups = numbers.groupBy(_ % 3)
    // Note: pair RDDs are special in Spark
    modThreeGroups foreach {
      case (m, vals) => println("mod 3 = " + m + ", count = " + vals.count(_ => true))
    }

    // countByValue: how many of the mod results are three
    // notice use of Option type
    // TODO: what point was I trying to make?
    val mods = modThreeGroups.collect({
      case (m, vals) => vals.count(_ => true) }).countByValue
    println("results found 3 times: " + mods.get(3))
    println("results found 4 times: " + mods.get(4))
    println("results found 7 times: " + mods.get(7))

    // max, min
    println("maximum element = " + letters.max())

    // first
    println("first element = " + letters.first())

    // sample: notice this returns an RDD
    println("random [fractional] sample without replacement: ")
    val samp = letters.sample(false, 0.25, 42)
    samp.foreach(println)

    // sortBy
    // TODO: this deserves a clearer example of how spiffy it is
    println("first element when reversed = " + letters.sortBy(v => v, false).first())

    // take, takeSample: these just return arrays
    println("first five letters:")
    val first5 = letters.take(5)
    first5.foreach(println)

    println("random five letters without replacement:")
    val random5 = letters.takeSample(false, 5)
    random5.foreach(println)
  }
}

3. 优化后的代码+注释

import org.apache.spark.{SparkConf, SparkContext}

object Ex4_MoreOperationsOnRDDs_FIX {
  def main(args: Array[String]) {
    // 添加RDD、DF后缀
    // persist()缓存

    val conf = new SparkConf().setAppName("Ex4_MoreOperationsOnRDDs").setMaster("local[4]")
    val sc = new SparkContext(conf)

    // put some data in an RDD
    val letterRDD = sc.parallelize('a' to 'z', 8)
    letterRDD.persist()

    // vowels, but NOT an RDD
    // this is important because you can't access RDDs in
    // operations on individual elements of an RDD
    val vowels = Seq('a', 'e', 'i', 'o', 'u')

    val consonants = letterRDD.filter(c => !vowels.contains(c))
    println("There are " + consonants.count() + " consonants")

    // can use a partial function to filter and transform
    // notice the new RDD in this case has a different type
    val consonantsAsDigits = letterRDD collect {
      case c: Char if !vowels.contains(c) => c.asDigit
    }
    consonantsAsDigits.foreach(println)

    // TODO: pay more attention to flatMap
    // flatMap: pay attention here -- the wordRDD don't always come
    // out in the right order, but the characters within a word do
    val wordRDD = sc.parallelize(Seq("Hello", "World"), 2)
    val chars = wordRDD.flatMap(w => w.iterator)
    //    println(chars.map(c => c.toString).reduce((s1, s2) => s1 + " " + s2))
    // 更简洁的写法
    println(chars.collect().mkString(" "))

    // groupBy
    val numberRDD = sc.parallelize(1 to 10, 4)
    val modThreeGroupRDD = numberRDD.groupBy(_ % 3)
    // Note: pair RDDs are special in Spark
    modThreeGroupRDD.foreach {
      case (mod, vals) =>
        //        println("mod 3 = " + m + ", count = " + vals.count(_ => true))
        // 推荐这样写
        println(s"mod 3 = $mod, count = ${vals.size}")
    }

    // countByValue: how many of the mod results are three
    // notice use of Option type
    // TODO: what point was I trying to make?
//    val mods = modThreeGroupRDD.collect({
//      case (m, vals) => vals.count(_ => true) }).countByValue
    val mods = modThreeGroupRDD.collect {
      case (mod, vals) => vals.size
    }.countByValue
    println("results found 3 times: " + mods.get(3))
    println("results found 4 times: " + mods.get(4))
    println("results found 7 times: " + mods.get(7))

    // max, min
    println("maximum element = " + letterRDD.max())

    // first
    println("first element = " + letterRDD.first())

    // sample: notice this returns an RDD
    println("random [fractional] sample without replacement: ")
    val sampRDD = letterRDD.sample(false, 0.25, 42)
    // sample返回的是一个RDD,如果要在本地打印,一定要注意collect
    sampRDD.collect().foreach(println)

    // sortBy
    // TODO: this deserves a clearer example of how spiffy it is
//    println("first element when reversed = " + letterRDD.sortBy(v => v, false).first())
    // 书写方式推荐这样换成2行,分开打印和RDD操作
    val first = letterRDD.sortBy(c => c, false).first()
    println(s"first element when reversed = $first")

    // take, takeSample: these just return arrays
    println("first five letterRDD:")
    val first5 = letterRDD.take(5)
    first5.foreach(println)

    println("random five letterRDD without replacement:")
    val random5 = letterRDD.takeSample(false, 5)
    random5.foreach(println)

    sc.stop()
  }
}
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值