import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

/**
 * Eleven ways to implement word count on an RDD of lines.
 *
 * Every method builds the same tiny input RDD ("Hello Scala", "Hello Spark"),
 * splits it into words, counts occurrences with a different RDD operator, and
 * prints the result on the driver. `main` runs only `Wordcount11`; the other
 * variants are kept as reference implementations.
 */
object wordcount {

  def main(args: Array[String]): Unit = {
    // Local single-threaded master: this is a demo, not a cluster job.
    val conf: SparkConf = new SparkConf().setMaster("local").setAppName("wordcount")
    val sc: SparkContext = new SparkContext(conf)
    Wordcount11(sc)
    sc.stop()
  }

  /** Way 1 — groupBy: group identical words, then count each group's size. */
  def Wordcount1(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    // Group by the word itself; each value is every occurrence of that word.
    val group: RDD[(String, Iterable[String])] = words.groupBy(word => word)
    val wordCount: RDD[(String, Int)] = group.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** Way 2 — groupByKey: pair each word with 1, group by key, count the group. */
  def Wordcount2(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val grouped: RDD[(String, Iterable[Int])] = wordOne.groupByKey()
    val wordCount: RDD[(String, Int)] = grouped.mapValues(iter => iter.size)
    wordCount.collect().foreach(println)
  }

  /** Way 3 — reduceByKey: sum the 1s per key (combines map-side before shuffle). */
  def Wordcount3(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.reduceByKey(_ + _)
    wordCount.collect().foreach(println)
  }

  /** Way 4 — aggregateByKey: zero value 0, same `+` for intra- and inter-partition merge. */
  def Wordcount4(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.aggregateByKey(0)(_ + _, _ + _)
    wordCount.collect().foreach(println)
  }

  /** Way 5 — foldByKey: shorthand for aggregateByKey when both merge functions are identical. */
  def Wordcount5(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.foldByKey(0)(_ + _)
    wordCount.collect().foreach(println)
  }

  /** Way 6 — combineByKey: createCombiner / mergeValue / mergeCombiners spelled out. */
  def Wordcount6(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: RDD[(String, Int)] = wordOne.combineByKey(
      v => v,                 // createCombiner: first value for a key becomes the combiner
      (x: Int, y) => x + y,   // mergeValue: fold another value into the combiner (type must be annotated)
      (t1, t2) => t1 + t2     // mergeCombiners: merge combiners across partitions
    )
    wordCount.collect().foreach(println)
  }

  /** Way 7 — countByKey: action that returns the per-key counts as a driver-side Map. */
  def Wordcount7(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordOne = words.map((_, 1))
    val wordCount: collection.Map[String, Long] = wordOne.countByKey()
    wordCount.foreach(println)
  }

  /** Way 8 — countByValue: action that counts distinct elements directly, no (word, 1) pairing needed. */
  def Wordcount8(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val wordCount: collection.Map[String, Long] = words.countByValue()
    wordCount.foreach(println)
  }

  /** Way 9 — reduce: map each word to a one-entry mutable Map and merge the maps pairwise. */
  def Wordcount9(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })
    val wordCount = mapWord.reduce(
      (map1, map2) => {
        // We cannot read k/v pairs out of the other map directly, so iterate
        // over map2 and accumulate each entry into map1.
        map2.foreach {
          case (word, count) =>
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
        }
        map1
      }
    )
    wordCount.foreach(println)
  }

  /** Way 10 — aggregate: same map-merging as Way 9, with an explicit zero value. */
  def Wordcount10(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })
    // Seed the zero value with a sentinel key "##" (stripped before printing).
    // Mutating the zero value is safe here: Spark serializes it and gives each
    // task its own deserialized copy.
    val wordCount = mapWord.aggregate(mutable.Map[String, Long](("##", 0)))(
      (map1, map2) => {
        // Fold every entry of map2 into map1 (cannot look up k/v pairs without iterating).
        map2.foreach {
          case (word, count) =>
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
        }
        map1
      },
      (map1, map2) => {
        // Same merge for combining per-partition results on the driver.
        map2.foreach {
          case (word, count) =>
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
        }
        map1
      }
    )
    // `filterKeys` is deprecated (returns a view in 2.13); use a plain filter.
    wordCount.filter { case (word, _) => word != "##" }.foreach(println)
  }

  /** Way 11 — fold: aggregate with a single merge function for both sides. */
  def Wordcount11(sc: SparkContext): Unit = {
    val rdd: RDD[String] = sc.makeRDD(List("Hello Scala", "Hello Spark"))
    val words = rdd.flatMap(_.split(" "))
    val mapWord = words.map(word => {
      mutable.Map[String, Long]((word, 1))
    })
    // "##" is a sentinel key in the zero value, stripped before printing.
    val wordCount = mapWord.fold(mutable.Map[String, Long](("##", 0)))(
      (map1, map2) => {
        // Fold every entry of map2 into map1 (cannot look up k/v pairs without iterating).
        map2.foreach {
          case (word, count) =>
            val newCount = map1.getOrElse(word, 0L) + count
            map1.update(word, newCount)
        }
        map1
      }
    )
    // `filterKeys` is deprecated (returns a view in 2.13); use a plain filter.
    wordCount.filter { case (word, _) => word != "##" }.foreach(println)
  }
}
// spark05 — eleven ways to implement word count in Spark (first published 2023-02-15 11:38:43)