//Method 1: map + reduceByKey
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object WordCount01 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("reduceByKey")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    // Flatten each line into individual words, one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    // Map each word to a (key, 1) pair
    val mapRDD: RDD[(String, Int)] = word.map((_, 1))
    // reduceByKey aggregates the values of each key
    val res: RDD[(String, Int)] = mapRDD.reduceByKey(_ + _)
    res.collect().foreach(println)
    sc.stop()
  }
}
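//To see what reduceByKey does to the (word, 1) pairs, here is a minimal sketch
//on a made-up in-memory sample (the words and the object name are assumptions for illustration):
import org.apache.spark.sql.SparkSession

object ReduceByKeySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("reduceByKeySketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    // sample (word, 1) pairs, equivalent to the output of mapRDD above
    val pairs = sc.parallelize(Seq(("spark", 1), ("spark", 1), ("flink", 1)))
    // values of the same key are combined pairwise: 1 + 1 = 2 for "spark"
    pairs.reduceByKey(_ + _).collect().foreach(println) // prints (spark,2) and (flink,1)
    sc.stop()
  }
}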
//Method 2: map + groupByKey + mapValues
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object WordCount02 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("groupByKey")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    // Flatten each line into individual words, one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    // Map each word to a (key, 1) pair
    val mapRDD: RDD[(String, Int)] = word.map((_, 1))
    // Group by key, producing (key, (1, 1, 1, ...))
    val groupRDD: RDD[(String, Iterable[Int])] = mapRDD.groupByKey()
    // The value side is an Iterable of ones, so its size is the word count
    val res: RDD[(String, Int)] = groupRDD.mapValues(iter => iter.size)
    res.collect().foreach(println)
    sc.stop()
  }
}
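//groupByKey shuffles every (word, 1) pair before counting, while reduceByKey pre-aggregates
//within each partition, so Method 1 is generally cheaper on large data. Since every value in
//the grouped Iterable is 1, summing the values is an equivalent alternative to taking its size.
//A minimal sketch on made-up sample pairs (the object name and data are assumptions for illustration):
import org.apache.spark.sql.SparkSession

object GroupByKeySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("groupByKeySketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    val pairs = sc.parallelize(Seq(("spark", 1), ("spark", 1), ("flink", 1)))
    // groupByKey -> ("spark", Iterable(1, 1)), ("flink", Iterable(1))
    val grouped = pairs.groupByKey()
    // summing the ones gives the same counts as iter.size
    grouped.mapValues(_.sum).collect().foreach(println) // prints (spark,2) and (flink,1)
    sc.stop()
  }
}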
//Method 3: groupBy + mapValues
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object WordCount03 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("groupBy")
      .master("local[2]")
      .getOrCreate()
    val sc = spark.sparkContext
    val lines: RDD[String] = sc.textFile("data/thatgirl.txt")
    // Flatten each line into individual words, one word per element
    val word: RDD[String] = lines.flatMap(_.split(" "))
    // groupBy is similar to groupByKey, but it does not require key-value data;
    // it can group plain elements directly, here keying each word by itself
    val groupRDD: RDD[(String, Iterable[String])] = word.groupBy(v => v)
    // The size of each group's iterable is the word count
    val res: RDD[(String, Int)] = groupRDD.mapValues(iter => iter.size)
    res.collect().foreach(println)
    sc.stop()
  }
}
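//groupBy(v => v) keys each word by itself, so each value is an Iterable of that word's
//occurrences and its size is the count. A minimal sketch on made-up sample words
//(the object name and data are assumptions for illustration):
import org.apache.spark.sql.SparkSession

object GroupBySketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("groupBySketch").master("local[2]").getOrCreate()
    val sc = spark.sparkContext
    val words = sc.parallelize(Seq("spark", "spark", "flink"))
    // groupBy -> ("spark", Iterable("spark", "spark")), ("flink", Iterable("flink"))
    val grouped = words.groupBy(v => v)
    grouped.mapValues(_.size).collect().foreach(println) // prints (spark,2) and (flink,1)
    sc.stop()
  }
}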