Preface
wordCount means accumulating the number of occurrences of each identical word in a piece of text and finally printing every word together with its count. The logic looks trivial, and I used to think wordCount was just Spark's "hello world"; in practice, though, a lot of Spark work really is wordCount, for example aggregating a certain type of data over a certain period. So this post lists several ways to write wordCount for reference.
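Each example below shows only the main method. As a minimal sketch, they assume scaffolding roughly like the following (the imports are required; the object name WordCount is just illustrative):

import org.apache.spark.{SparkConf, SparkContext}

// Illustrative wrapper object; each example below supplies the body of main.
object WordCount {
  def main(args: Array[String]): Unit = {
    // ... one of the wordCount variants from the sections below ...
  }
}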
1. groupBy
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // Split each line into words, group identical words together,
  // then map each group of occurrences to its size (the count).
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .groupBy(num => num)
    .mapValues(iters => iters.size)
  value.collect().foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
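To see why mapValues takes the size, it helps to look at what groupBy produces before mapValues: it keeps every occurrence of a word as a value. A hedged sketch of such an inspection (the exact collection type printed may vary between Spark versions):

// Equivalent to the map/flatMap pair above; typically prints something like
// (hello,CompactBuffer(hello, hello)), (spark,CompactBuffer(spark)), (java,CompactBuffer(java))
data.flatMap(_.split(" ")).groupBy(num => num).collect().foreach(println)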
2. groupByKey
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // Map each word to (word, 1), group the 1s per key,
  // then take the size of each group as the count.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .groupByKey()
    .mapValues(iters => iters.size)
  value.collect().foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
3. reduceByKey
Spark's RDD API includes the aggregation method reduceByKey, which gives the quickest way to implement wordCount.
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // Map each word to (word, 1) and sum the values per key.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .reduceByKey(_ + _)
  value.collect().foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
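Compared with the groupBy and groupByKey versions, reduceByKey pre-aggregates the (word, 1) pairs inside each partition before the shuffle (a map-side combine), so far less data crosses the network. That is why it is usually the recommended way to write wordCount.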
4. aggregateByKey
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // aggregateByKey(zeroValue)(seqOp, combOp): the first function adds values
  // within a partition, the second merges the partial sums across partitions.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .aggregateByKey(0)(_ + _, _ + _)
  value.collect().foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
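In the wordCount case both functions are addition, so their different roles are invisible. A hedged illustration (made-up data, not from the original) where the intra-partition and inter-partition functions differ shows the distinction:

// Take the max within each partition, then sum the per-partition maxima.
val pairs = sc.makeRDD(List(("a", 1), ("a", 2), ("a", 3), ("a", 4)), 2)
pairs.aggregateByKey(0)((x, y) => math.max(x, y), _ + _).collect().foreach(println)
// With the two partitions holding (1, 2) and (3, 4), this should print (a,6): max 2 plus max 4.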
5. foldByKey
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // foldByKey(zeroValue)(func): like reduceByKey, but starting from an initial value.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .foldByKey(0)(_ + _)
  value.collect().foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
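Note that foldByKey(0)(_ + _) is equivalent to the aggregateByKey(0)(_ + _, _ + _) call above: foldByKey is the special case of aggregateByKey where the intra-partition and inter-partition functions are the same.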
6. combineByKey
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .combineByKey(
      d => d,                    // createCombiner: the first 1 seen for a key becomes the initial count
      (x: Int, y) => x + y,      // mergeValue: add each further 1 to the partition-local count
      (x: Int, y: Int) => x + y  // mergeCombiners: merge the counts from different partitions
    )
  value.collect().foreach(println)
  sc.stop()
}
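combineByKey is the most general of these combine-style operators; reduceByKey, foldByKey and aggregateByKey are all expressed through the same underlying mechanism. For wordCount the createCombiner (the first argument) just passes the initial 1 through unchanged, which is why the three arguments end up looking almost identical here.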
7. countByKey
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // countByKey is an action: it counts how many (word, 1) pairs exist per key
  // and returns the result as a Map on the driver.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .map(num => (num, 1))
    .countByKey()
  value.foreach(println)
  sc.stop()
}
Output:
(spark,1)
(hello,2)
(java,1)
8. countByValue
def main(args: Array[String]): Unit = {
  val sparkconf = new SparkConf().setMaster("local[*]").setAppName("test")
  val sc = new SparkContext(sparkconf)
  val data = sc.makeRDD(List("hello spark", "hello java"))
  // countByValue counts each distinct word directly, without building (word, 1) pairs.
  val value = data.map(_.split(" "))
    .flatMap(num => num)
    .countByValue()
  value.foreach(println)
  sc.stop()
}
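One caveat that applies to both countByKey and countByValue: they are actions that pull the entire result back to the driver as a Map, so they are only suitable when the number of distinct words is small. For large vocabularies, prefer reduceByKey and keep the result as an RDD.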