I recently read *Data Algorithms: Recipes for Scaling Up with Hadoop and Spark*, in which the algorithms are implemented in Java. The source code can be downloaded from
https://github.com/mahmoudparsian/data-algorithms-book/
For learning purposes, here is a Scala version of its Secondary Sort algorithm.
package com.bbw5.dataalgorithms.spark
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.Logging
/**
* The SparkSecondarySort object implements the secondary sort design
* pattern by sorting reducer values in memory/RAM.
*
* Input:
*
* name, time, value
* x,2,9
* y,2,5
* x,1,3
* y,1,7
* y,3,1
* x,3,6
* z,1,4
* z,2,8
* z,3,7
* z,4,0
*
* Output: generate a time series per name, with values sorted by time:
*
*        t1 t2 t3 t4
* x => [3, 9, 6]
* y => [7, 5, 1]
* z => [4, 8, 7, 0]
*
* or equivalently, keeping the (time, value) pairs:
*
* x => [(1,3), (2,9), (3,6)]
* y => [(1,7), (2,5), (3,1)]
* z => [(1,4), (2,8), (3,7), (4,0)]
*
* @author bbw5
*
*/
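// note: the case class is defined at the top level on purpose; nesting it
// inside main() can break the reflection that toDF() uses to derive the schema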
case class TestObj(name: String, time: Int, value: Int)
object SparkSecondarySort extends Logging {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("SparkSecondarySort")
val sc = new SparkContext(sparkConf)
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val filename = "D:/temp/data/ss2.txt"
val textFile = sc.textFile(filename)
// sort using the RDD API: group by name, then sort each group's values
// by time in memory (time is parsed as Int so that e.g. 10 sorts after 2)
val outputs = textFile.map { line =>
  val array = line.split(",")
  (array(0), (array(1).toInt, array(2).toInt))
}.groupByKey().mapValues(_.toList.sortBy(_._1))
outputs.collect.foreach(println)
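// prints something like (key order may vary across runs):
// (x,List((1,3), (2,9), (3,6)))
// (y,List((1,7), (2,5), (3,1)))
// (z,List((1,4), (2,8), (3,7), (4,0)))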
// this is used to implicitly convert an RDD to a DataFrame.
import sqlContext.implicits._
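// map each CSV line to a TestObj and let toDF() derive the schema from the case class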
val df = textFile.map(_.split(",")).map { t => TestObj(t(0), t(1).toInt, t(2).toInt) }.toDF()
df.show()
df.printSchema()
df.groupBy("name").count().show()
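// register the DataFrame as a temporary table so it can be queried with SQL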
df.registerTempTable("test")
// sort using the DataFrame API: let SQL do the ordering
val ssDf = sqlContext.sql("SELECT name, time, value FROM test ORDER BY name, time")
// note: groupByKey shuffles again, so the per-key ordering produced by ORDER BY
// is only preserved here because the test data fits in a single partition;
// it is not guaranteed in general
ssDf.map(r => (r.getString(0), (r.getInt(1), r.getInt(2)))).groupByKey().collect.foreach(println)
sc.stop()
}
}
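Both variants above sort each group's values in memory after groupByKey, which matches the book's "in-RAM" recipe but breaks down when a single key carries too many values. The book's scalable recipe pushes the sorting into the shuffle itself via a composite key and a custom partitioner. Below is a minimal sketch of that idea using Spark's repartitionAndSortWithinPartitions; the NamePartitioner class and the object name are my own illustration, not code from the book.

package com.bbw5.dataalgorithms.spark

import org.apache.spark.Partitioner
import org.apache.spark.SparkConf
import org.apache.spark.SparkContext

object SparkSecondarySortInShuffle {

  // partition on the name half of the composite key only, so that all
  // records for a given name land in the same partition
  class NamePartitioner(partitions: Int) extends Partitioner {
    require(partitions > 0)
    override def numPartitions: Int = partitions
    override def getPartition(key: Any): Int = key match {
      case (name: String, _) => (name.hashCode % partitions + partitions) % partitions
    }
  }

  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("SparkSecondarySortInShuffle"))
    val textFile = sc.textFile("D:/temp/data/ss2.txt")

    // composite key (name, time): the default tuple ordering sorts by
    // name first and time second, which is exactly the order we want
    val pairs = textFile.map { line =>
      val a = line.split(",")
      ((a(0), a(1).toInt), a(2).toInt)
    }

    // sort during the shuffle: each partition comes out ordered by
    // (name, time) without any single group being held in memory
    val sorted = pairs.repartitionAndSortWithinPartitions(new NamePartitioner(2))
    sorted.collect.foreach(println)

    sc.stop()
  }
}

Because all records for a name are contiguous and already ordered within a partition, downstream code can stream over each partition with mapPartitions and emit one (name, values) series at a time, without calling groupByKey at all.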