combineByKey: one of Spark's core advanced functions

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}

/**
 * Demonstrates combineByKey: counts the books per author while
 * keeping the most recently merged title for each one.
 */
object test {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("test").setMaster("local[12]")
    val sc = new SparkContext(conf)
    // (author, title) pairs; several authors appear more than once
    val book = Array(("大冰", "阿弥陀佛么么哒"), ("大冰", "我不"), ("柴静", "看见"), ("知乎", "知乎"),
      ("胡赛尼", "追风筝的人"), ("大冰", "你坏"), ("钱钟书", "围城"), ("钱钟书", "猫"), ("大冰", "好吗好的"))
    sc.makeRDD(book).combineByKey(
      // createCombiner: the first title for a key starts the accumulator (count = 1, title)
      v => (1, v),
      // mergeValue: each further title for the same key bumps the count and replaces the title
      (acc: (Int, String), v: String) => (acc._1 + 1, v),
      // mergeCombiners: merge per-partition accumulators; counts add up, one title survives
      (acc: (Int, String), acc1: (Int, String)) => (acc._1 + acc1._1, acc1._2),
      // route all keys to a single partition
      new HashPartitioner(1),
      // disable map-side combining
      false
    ).map {
      case (k, (count, title)) => Array(k, count, title).mkString("\t")
    }.foreach(println)
    sc.stop()
  }
}
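
The three function arguments are what make combineByKey general: createCombiner turns the first value seen for a key into an accumulator, mergeValue folds further values on the same partition into that accumulator, and mergeCombiners merges accumulators across partitions. The listing above therefore prints one tab-separated line per author: 大冰 appears four times, so its line shows a count of 4 plus whichever of its titles was merged last. As a second illustration, here is a minimal sketch reusing the same sc and book from above (titlesByAuthor is a hypothetical name) that keeps every title per author instead of only the last one:

val titlesByAuthor = sc.makeRDD(book).combineByKey(
  // createCombiner: start a one-element list from the first title
  (v: String) => List(v),
  // mergeValue: prepend each further title seen in the same partition
  (acc: List[String], v: String) => v :: acc,
  // mergeCombiners: concatenate the per-partition lists
  (a: List[String], b: List[String]) => a ::: b
)
titlesByAuthor.foreach(println)

groupByKey could collect the same lists, but combineByKey lets the accumulator type differ from the value type, which is exactly what the count-plus-title example exploits.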