I recently read *Data Algorithms: Recipes for Scaling Up with Hadoop and Spark*, in which the algorithms are implemented in Java. The source code can be downloaded from
https://github.com/mahmoudparsian/data-algorithms-book/
For learning purposes, here is a Scala version of its Secondary Sort algorithm.
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.Logging
/**
* The SparkSecondarySort object implements the secondary sort design
* pattern by sorting reducer values in memory/RAM.
*
* Input:
*
* name, time, value
* x,2,9
* y,2,5
* x,1,3
* y,1,7
* y,3,1
* x,3,6
* z,1,4
* z,2,8
* z,3,7
* z,4,0
*
* Output: generate a time series per name, with values sorted by time:
*
*        t1 t2 t3 t4
* x => [3, 9, 6]
* y => [7, 5, 1]
* z => [4, 8, 7, 0]
*
* or equivalently, keeping the (time, value) pairs:
*
* x => [(1,3), (2,9), (3,6)]
* y => [(1,7), (2,5), (3,1)]
* z => [(1,4), (2,8), (3,7), (4,0)]
*
* @author bbw5
*
*/
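// note: the case class is defined at the top level on purpose; nesting it
// inside main() can break the reflection that toDF() uses to derive the schema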
case class TestObj(name: String, time: Int, value: Int)
object SparkSecondarySort extends Logging {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("SparkSecondarySort")
val sc = new SparkContext(sparkConf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val filename = "D:/temp/data/ss2.txt"
val textFile = sc.textFile(filename)
// sort using the RDD API: group by name, then sort each group's values
// by time in memory (time is parsed as Int so that e.g. 10 sorts after 2)
val outputs = textFile.map { line =>
  val array = line.split(",")
  (array(0), (array(1).toInt, array(2).toInt))
}.groupByKey().mapValues(_.toList.sortBy(_._1))
outputs.collect.foreach(println)
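// prints something like (key order may vary across runs):
// (x,List((1,3), (2,9), (3,6)))
// (y,List((1,7), (2,5), (3,1)))
// (z,List((1,4), (2,8), (3,7), (4,0)))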
// this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._
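// map each CSV line to a TestObj and let toDF() derive the schema from the case class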
val df = textFile.map(_.split(",")).map { t => TestObj(t(0), t(1).toInt, t(2).toInt) }.toDF()
df.show()
df.printSchema()
df.groupBy("name").count().show()
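// register the DataFrame as a temporary table so it can be queried with SQL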
df.registerTempTable("test")
// sort using the DataFrame API: let SQL do the ordering
val ssDf = sqlContext.sql("SELECT name, time, value FROM test ORDER BY name, time")
// note: groupByKey shuffles again, so the per-key ordering produced by ORDER BY
// is only preserved here because the test data fits in a single partition;
// it is not guaranteed in general
ssDf.map(r => (r.getString(0), (r.getInt(1), r.getInt(2)))).groupByKey().collect.foreach(println)
sc.stop()
}
}
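Both variants above sort each group's values in memory after groupByKey, which matches the book's "in-RAM" recipe but breaks down when a single key carries too many values. The book's scalable recipe pushes the sorting into the shuffle itself via a composite key and a custom partitioner. Below is a minimal sketch of that idea using Spark's repartitionAndSortWithinPartitions; the NamePartitioner class and the object name are my own illustration, not code from the book.

package com.bbw5.dataalgorithms.spark

import org.apache.spark.Partitioner
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SparkSecondarySortInShuffle {

  // partition on the name half of the composite key only, so that all
  // records for a given name land in the same partition
  class NamePartitioner(partitions: Int) extends Partitioner {
    require(partitions > 0)
    override def numPartitions: Int = partitions
    override def getPartition(key: Any): Int = key match {
      case (name: String, _) => (name.hashCode % partitions + partitions) % partitions
    }
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkSecondarySortInShuffle"))
    val textFile = sc.textFile("D:/temp/data/ss2.txt")

    // composite key (name, time): the default tuple ordering sorts by
    // name first and time second, which is exactly the order we want
    val pairs = textFile.map { line =>
      val a = line.split(",")
      ((a(0), a(1).toInt), a(2).toInt)
    }

    // sort during the shuffle: each partition comes out ordered by
    // (name, time) without any single group being held in memory
    val sorted = pairs.repartitionAndSortWithinPartitions(new NamePartitioner(2))
    sorted.collect.foreach(println)

    sc.stop()
  }
}

Because all records for a name are contiguous and already ordered within a partition, downstream code can stream over each partition with mapPartitions and emit one (name, values) series at a time, without calling groupByKey at all.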