1 map + reduceByKey
// Classic word count: split each line on commas, pair every token with 1,
// then sum the per-token counts with reduceByKey.
sparkContext.textFile("hdfs://ifeng:9000/hdfsapi/wc.txt")
  .flatMap(line => line.split(","))
  .map(word => (word, 1))
  .reduceByKey((left, right) => left + right)
  .collect()
2 countByValue代替map + reduceByKey
// countByValue replaces the map + reduceByKey pair: it counts each distinct
// token and returns a driver-side Map[String, Long], which we print directly.
val RDDfile = sparkContext.textFile("hdfs://ifeng:9000/hdfsapi/wc.txt")
RDDfile
  .flatMap(line => line.split(","))
  .countByValue()
  .foreach(println)
3 aggregateByKey
// aggregateByKey with zero value 0: the first function folds a value into the
// per-partition accumulator, the second merges accumulators across partitions.
// For a plain sum both are just addition.
val aggregated = RDDfile
  .flatMap(line => line.split(","))
  .map(word => (word, 1))
  .aggregateByKey(0)((acc, v) => acc + v, (a, b) => a + b)
aggregated.collect().foreach(println)
4 foldByKey
// foldByKey: like reduceByKey but with an explicit zero element (0 here),
// so a single addition function suffices for the per-key sum.
val folded = RDDfile
  .flatMap(line => line.split(","))
  .map(word => (word, 1))
  .foldByKey(0)((acc, v) => acc + v)
folded.collect().foreach(println)
5 groupByKey+map
// groupByKey + per-key sum of the grouped 1s.
// Fix: use mapValues instead of map over the whole (key, values) tuple —
// the key is untouched, and mapValues preserves the RDD's partitioner.
// NOTE(review): groupByKey materialises every value for a key; for a plain
// sum, reduceByKey/foldByKey (snippets above) shuffle far less data.
RDDfile.flatMap(_.split(","))
  .map((_, 1))
  .groupByKey()
  .mapValues(_.sum)
  .collect()
  .foreach(println)
6 combineByKey
// combineByKey word count; its three functions are:
//   createCombiner — the first value seen for a key starts the count,
//   mergeValue     — fold another in-partition value into the count,
//   mergeCombiners — add per-partition counts together.
RDDfile
  .flatMap(line => line.split(","))
  .map(word => (word, 1))
  .combineByKey(
    (first: Int) => first,
    (acc: Int, v: Int) => acc + v,
    (a: Int, b: Int) => a + b
  )
  .collect()
  .foreach(println)
1 groupBy
// Word count via groupBy on the token itself; each group's size is its count.
// NOTE(review): `fileRDD` is assumed to be defined elsewhere in these notes
// (cf. `RDDfile` above) — confirm.
// Fixes: mapValues instead of mapping the whole tuple (key unchanged,
// partitioner preserved), and collect() before printing so the output is
// produced on the driver rather than on the executors — consistent with
// the earlier snippets.
fileRDD.flatMap(_.split(","))
  .groupBy(token => token)
  .mapValues(_.size)
  .collect()
  .foreach(println)
2 groupByKey
// Word count via map-to-pair + groupByKey, summing each key's grouped 1s.
// NOTE(review): `fileRDD` is assumed to be defined elsewhere in these notes
// (cf. `RDDfile` above) — confirm.
// Fixes: mapValues instead of mapping the whole tuple (key unchanged,
// partitioner preserved), and collect() before printing so the output is
// produced on the driver rather than on the executors — consistent with
// the earlier snippets.
fileRDD.flatMap(_.split(","))
  .map((_, 1))
  .groupByKey()
  .mapValues(_.sum)
  .collect()
  .foreach(println)