Implementing reduceByKey and groupByKey with combineByKey in Spark
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import scala.collection.mutable.ListBuffer

/**
 * Implement reduceByKey with combineByKey.
 * @param sc
 */
def combineByKeyToReduceBy(sc: SparkContext): Unit = {
  val rdd = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3), ("a", 2), ("a", 3), ("b", 3), ("c", 2)))
  rdd.reduceByKey(_ + _).foreach(println)
  println("The output above comes from reduceByKey; below, combineByKey produces the same result")
  val reduceValue: RDD[(String, Int)] = rdd.combineByKey(
    createCombiner1,
    mergeValue1,
    mergeCombiners1
  )
  reduceValue.foreach(println)
}
// The value returned here becomes the accumulator c passed to mergeValue1 below;
// the parameter v is the value of an incoming key-value pair from the RDD.
def createCombiner1(v: Int): Int = {
  v
}
// c is the running accumulator: on every call, c + v becomes the new c.
// v is the next value for the key; this merging happens within a single partition.
def mergeValue1(c: Int, v: Int): Int = {
  c + v
}
// This step merges across partitions: after each partition has produced its
// partial sum per key, the partial sums for the same key are added together.
// c1 and c2 play the same accumulator role as the parameters above.
def mergeCombiners1(c1: Int, c2: Int): Int = {
  c1 + c2
}
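To make the partition behavior visible, the same aggregation can be written with inline lambdas; this is a minimal sketch (the names rdd2 and summed are just for this example), parallelizing into two partitions so that keys span partition boundaries and the cross-partition merge actually runs:

val rdd2 = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3), ("a", 2), ("a", 3), ("b", 3), ("c", 2)), 2)
val summed: RDD[(String, Int)] = rdd2.combineByKey(
  (v: Int) => v,                  // createCombiner: first value of a key in a partition
  (c: Int, v: Int) => c + v,      // mergeValue: fold later values within the partition
  (c1: Int, c2: Int) => c1 + c2   // mergeCombiners: add the per-partition partial sums
)
summed.foreach(println)           // prints (a,6), (b,5), (c,5) in some order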
/**
 * Implement groupByKey with combineByKey.
 * @param sc
 */
def combineByKeyToGroupBy(sc: SparkContext): Unit = {
  val rdd = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3), ("a", 2), ("a", 3), ("b", 3), ("c", 2)))
  rdd.groupByKey().foreach(println)
  val groupByValue: RDD[(String, ListBuffer[String])] = rdd.combineByKey(
    createCombiner,
    mergeValue,
    mergeCombiners
  )
  groupByValue.foreach(println)
}
// Called the first time a key is seen in a partition: start a buffer holding its first value.
def createCombiner(v: Int): ListBuffer[String] = {
  println("======== this is createCombiner ==========")
  val list = new ListBuffer[String]
  list.append(v.toString)
  list
}
// Called for every further value of a key within the same partition: append it to the buffer.
def mergeValue(ls: ListBuffer[String], v: Int): ListBuffer[String] = {
  println("======== this is mergeValue ==========")
  ls.append(v.toString)
  ls
}
// Called when partial buffers for the same key arrive from different partitions: concatenate them.
def mergeCombiners(ls1: ListBuffer[String], ls2: ListBuffer[String]): ListBuffer[String] = {
  ls1 ++ ls2
}
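Note that the real groupByKey returns RDD[(String, Iterable[Int])] here, while the version above converts every value to a String. A sketch that stays closer to groupByKey's shape by keeping the values as Int (the name combineByKeyToGroupByInt is made up for this example):

def combineByKeyToGroupByInt(sc: SparkContext): Unit = {
  val rdd = sc.parallelize(List(("a", 1), ("b", 2), ("c", 3), ("a", 2), ("a", 3), ("b", 3), ("c", 2)))
  val grouped: RDD[(String, ListBuffer[Int])] = rdd.combineByKey(
    (v: Int) => ListBuffer(v),                                 // start a buffer with the first value
    (buf: ListBuffer[Int], v: Int) => buf += v,                // append within a partition
    (b1: ListBuffer[Int], b2: ListBuffer[Int]) => b1 ++= b2    // concatenate partition buffers
  )
  grouped.foreach(println) // e.g. (a,ListBuffer(1, 2, 3))
}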
The main function
def main(args: Array[String]): Unit = {
  val sc: SparkContext = SparkUtil.getDefaultSparkContext(SparkRDD.getClass.getName)
  // call the test functions here
}
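Filled in, it could look like the following sketch, which simply drives the two demos above and uses try/finally so the context is released even if a job fails:

def main(args: Array[String]): Unit = {
  val sc: SparkContext = SparkUtil.getDefaultSparkContext(SparkRDD.getClass.getName)
  try {
    combineByKeyToReduceBy(sc)
    combineByKeyToGroupBy(sc)
  } finally {
    SparkUtil.closeSparkContext(sc) // stop the context whatever happens
  }
}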
The SparkUtil utility class
import org.apache.spark.{SparkConf, SparkContext}

object SparkUtil {
  val DEFAULT_MASTER_URL = "local[*]"

  def getSparkContext(appName: String, master: String): SparkContext =
    new SparkContext(new SparkConf().setAppName(appName).setMaster(master))

  def getDefaultSparkContext(appName: String): SparkContext =
    getSparkContext(appName, DEFAULT_MASTER_URL)

  // stop() shuts the context down and releases its resources.
  def closeSparkContext(sc: SparkContext): Unit = if (sc != null) sc.stop()
}