Spark Word Count (WordCount)

package sparkday01

import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val config = new SparkConf()
      .setAppName("Spark wordcount")
      // local[num]: use num threads to simulate cluster execution
      // local[*]: use all idle local threads to simulate cluster execution
      // local: use a single thread to simulate cluster execution
      .setMaster("local[2]")

    // SparkContext is the entry point of a Spark program
    val sc = new SparkContext(config)

    // Implement word count with Spark.
    // Load the dataset; each element is one line of the file.
    val source: RDD[String] = sc.textFile("D:\\abc\\test1.txt", 2)

    // groupByKey
    val res0 = source.flatMap(_.split(" ")).map((_, 1)).groupByKey().map(x => (x._1, x._2.sum)).collect()
    // reduceByKey
    val res1 = source.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _).collect()
    // aggregateByKey(initial value)(partition-local aggregation, global aggregation)
    val res2 = source.flatMap(_.split(" ")).map((_, 1)).aggregateByKey(0)(_ + _, _ + _).collect()
    // foldByKey(initial value)(aggregation): the same function is used for both the partition-local and the global aggregation
    val res3 = source.flatMap(_.split(" ")).map((_, 1)).foldByKey(0)(_ + _).collect()
    // combineByKey(createCombiner, mergeValue, mergeCombiners): the initial (combiner) type may differ from the final result type
    val res4 = source.flatMap(_.split(" ")).map((_, 1)).combineByKey((x: Int) => x, (x: Int, y: Int) => x + y, (x: Int, y: Int) => x + y).collect()

    println("groupByKey:" + res0.toBuffer)
    println("reduceByKey:" + res1.toBuffer)
    println("aggregateByKey:" + res2.toBuffer)
    println("foldByKey:" + res3.toBuffer)
    println("combineByKey:" + res4.toBuffer)

    // Release resources
    sc.stop()
  }
}
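The post does not show a build file. To compile and run the class above locally, a minimal sbt setup might look like the sketch below; the Scala and Spark versions are assumptions, not taken from the post, so adjust them to match your environment.

// build.sbt (sketch; versions are illustrative assumptions)
name := "spark-wordcount"
version := "0.1"
scalaVersion := "2.12.18"          // Spark 3.x is published for Scala 2.12/2.13
libraryDependencies += "org.apache.spark" %% "spark-core" % "3.3.2"

For a hypothetical D:\abc\test1.txt containing the two lines "hello spark" and "hello scala", all five variants print the same counts, for example (hello,2), (spark,1), (scala,1); only the ordering of the pairs may differ between runs.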
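The comment on the last variant notes that with combineByKey the combiner's type may differ from the input value type. The self-contained sketch below illustrates that point by collecting each word's 1s into a List[Int] instead of summing them; the object name and the in-memory sample lines are made up for illustration and are not part of the original code.

package sparkday01

import org.apache.spark.{SparkConf, SparkContext}

object CombineByKeyDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("combineByKey demo").setMaster("local[2]"))

    // Build the pair RDD in memory instead of from a file (illustrative sample data)
    val pairs = sc.parallelize(Seq("hello spark", "hello scala"), 2)
      .flatMap(_.split(" "))
      .map((_, 1))

    // createCombiner: Int => List[Int]; mergeValue within a partition; mergeCombiners across partitions
    val grouped = pairs.combineByKey(
      (v: Int) => List(v),
      (acc: List[Int], v: Int) => v :: acc,
      (a: List[Int], b: List[Int]) => a ::: b
    ).collect()

    println(grouped.toBuffer) // e.g. ArrayBuffer((hello,List(1, 1)), (spark,List(1)), (scala,List(1)))
    sc.stop()
  }
}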