package second_study
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
object many_wordcounts {
  def main(args: Array[String]): Unit = {
    // create the Spark environment
    val sparkconf = new SparkConf().setAppName("wordcount").setMaster("local[*]")
    val sc = new SparkContext(sparkconf)
    // read the input file
    val value: RDD[String] = sc.textFile("E:\\idea\\IdeaProjects\\Spark-Project\\Spark-Core\\src\\main\\resources\\data.txt")
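    // the eight variants below: methods 1-2 group the words, methods 3-6 aggregate (word, 1) pairs, methods 7-8 are actions that return the counts to the driver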
    // Method 1: map + groupByKey + mapValues
    val value1: RDD[String] = value.flatMap(_.split(" "))
    val value2: RDD[(String, Int)] = value1.map((_, 1))
    val value3: RDD[(String, Iterable[Int])] = value2.groupByKey()
    val res: RDD[(String, Int)] = value3.mapValues(_.size)
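    // groupByKey ships every (word, 1) record across the shuffle before counting; the reduce-style methods below combine partial counts map-side first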
    // Method 2: groupBy + mapValues
    val value4: RDD[String] = value.flatMap(_.split(" "))
    val value5: RDD[(String, Iterable[String])] = value4.groupBy((data: String) => data)
    val res2: RDD[(String, Int)] = value5.mapValues(_.size)
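    // groupBy keys each word by itself, so the duplicated word strings travel through the shuffle as the group values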
    // re-split the words (value6, reused by method 8) and build (word, 1) pairs (value7, reused by methods 3-7)
    val value6: RDD[String] = value.flatMap(_.split(" "))
    val value7: RDD[(String, Int)] = value6.map((_, 1))
    // Method 3: map + reduceByKey
    val res3: RDD[(String, Int)] = value7.reduceByKey(_ + _)
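    // reduceByKey merges the counts for each key inside every partition before shuffling, which is why it is preferred over groupByKey for aggregation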
    // Method 4: map + aggregateByKey
    val res4: RDD[(String, Int)] = value7.aggregateByKey(0)(_ + _, _ + _)
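    // aggregateByKey takes an initial value (0) plus separate within-partition and cross-partition functions; with both set to addition it behaves like reduceByKey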
    // Method 5: map + foldByKey
    val res5: RDD[(String, Int)] = value7.foldByKey(0)(_ + _)
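    // foldByKey is the special case of aggregateByKey where the same function is used for both phases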
    // Method 6: map + combineByKey
    val res6: RDD[(String, Int)] = value7.combineByKey(
      (x: Int) => x,               // createCombiner: the first count seen for a key in a partition
      (x: Int, y: Int) => x + y,   // mergeValue: add another count within the partition
      (x: Int, y: Int) => x + y    // mergeCombiners: merge the per-partition totals
    )
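    // combineByKey is the general primitive that reduceByKey, aggregateByKey and foldByKey are built on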
    // Method 7: map + countByKey
    val res7: collection.Map[String, Long] = value7.countByKey()
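    // countByKey is an action: it returns a Scala Map to the driver, so it is only appropriate when the number of distinct words is small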
    // Method 8: countByValue
    val res8: collection.Map[String, Long] = value6.countByValue()
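    // countByValue counts each distinct element of the word RDD directly, with no explicit (word, 1) mapping; it is also an action that collects the result to the driver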
    sc.stop()
  }
}
The code above demonstrates eight ways to implement WordCount with Spark RDDs.
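To sanity-check the results, any of the variants can be collected and printed before sc.stop(). A minimal sketch, assuming data.txt contains the two space-separated lines "hello spark" and "hello scala":

    res3.collect().foreach(println)   // prints (hello,2), (spark,1), (scala,1) in some order
    println(res7)                     // e.g. Map(hello -> 2, spark -> 1, scala -> 1)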