An Introduction to WordCount with Spark's Three Data Abstractions
Spark offers three core data abstractions: RDD, DataFrame, and Dataset.
Below is a WordCount program written against each of the three (a quick sketch of how the abstractions interconvert follows the sample data).
Input dataset:
a,b
c,a
d,b
a,d
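These three abstractions convert freely into one another. As a quick orientation, here is a minimal conversion sketch, e.g. in the spark-shell (the path and the column name "line" are just illustrative):
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("conversions").getOrCreate()
import spark.implicits._

val rdd: RDD[String] = spark.sparkContext.textFile("./data/dataset/data.txt")
val ds: Dataset[String] = rdd.toDS()          // RDD -> Dataset (needs an Encoder from implicits)
val df: DataFrame = ds.toDF("line")           // Dataset -> DataFrame (a Dataset[Row])
val dsAgain: Dataset[String] = df.as[String]  // single-column DataFrame -> typed Dataset
val rddAgain: RDD[String] = dsAgain.rdd       // back to an RDD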
1. Initial data type: RDD
Entry point: SparkContext
Read method:
sc.textFile("./data/dataset/data.txt")
Return type:
RDD[String]
import org.apache.spark.{SparkConf, SparkContext}

object WordCount {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("wc").setMaster("local[*]"))
    sc.textFile("./data/dataset/data.txt")
      .flatMap(_.split(","))  // split each line on "," into words
      .map((_, 1))            // pair each word with an initial count of 1
      .reduceByKey(_ + _)     // sum the counts for each word
      .foreach(println)
    sc.stop()
  }
}
Output:
(d,2)
(a,3)
(b,2)
(c,1)
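Note that foreach(println) runs on the executors, so the print order above is not deterministic. If sorted output is wanted, a small variant (an illustrative sketch reusing the same sc) is:
// Sort by count, descending, and collect to the driver for a stable print order
sc.textFile("./data/dataset/data.txt")
  .flatMap(_.split(","))
  .map((_, 1))
  .reduceByKey(_ + _)
  .sortBy(_._2, ascending = false)
  .collect()
  .foreach(println)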
2. Initial data type: DataFrame
Entry point: SparkSession
Read method:
spark.read.text("./src/main/data/data.txt")
Return type:
DataFrame
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    val lines = spark.read.text("./src/main/data/data.txt")
    // Take the first column of each line, union it with the second column,
    // then count how many times each value occurs
    lines.map(_.getString(0).split(",")(0))
      .union(lines.map(_.getString(0).split(",")(1)))
      .groupBy("value")
      .count()
      .show()

    spark.stop()
  }
}
Output:
+-----+-----+
|value|count|
+-----+-----+
|    d|    2|
|    c|    1|
|    b|    2|
|    a|    3|
+-----+-----+
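The program above reads the file twice and unions the two columns. The same result can be had in a single pass with the built-in split and explode functions; a sketch reusing the spark session from the example:
import org.apache.spark.sql.functions.{col, explode, split}

// Split each line on "," into an array, explode the array into one row per word
spark.read.text("./src/main/data/data.txt")
  .select(explode(split(col("value"), ",")).as("word"))
  .groupBy("word")
  .count()
  .show()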
3. Initial data type: Dataset
Entry point: SparkSession
Read method:
spark.read.textFile("./src/main/data/data.txt")
Return type:
Dataset[String]
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    spark.read.textFile("./src/main/data/data.txt")
      .flatMap(_.split(","))      // split each line on "," into words
      .groupByKey(_.toLowerCase)  // group words by their lowercase form
      .count()                    // Dataset[(String, Long)]
      .show()

    spark.stop()
  }
}
Output:
+-----+--------+
|value|count(1)|
+-----+--------+
|    d|       2|
|    c|       1|
|    b|       2|
|    a|       3|
+-----+--------+
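The default header count(1) comes from the typed count() aggregation. Since the result is a Dataset[(String, Long)], its columns can be renamed with toDF if friendlier names are wanted, e.g.:
// Rename the typed result's columns for a cleaner header
spark.read.textFile("./src/main/data/data.txt")
  .flatMap(_.split(","))
  .groupByKey(_.toLowerCase)
  .count()
  .toDF("word", "count")
  .show()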
4. Dataset, version 2
Entry point: SparkSession
Read method:
spark.read.text("./src/main/data/data.txt").as[String]
This converts the DataFrame into a Dataset.
Return type:
Dataset[String]
// Dataset, version 2
import org.apache.spark.sql.SparkSession

object WordCount {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .master("local[*]")
      .appName("Read_Write")
      .getOrCreate()
    import spark.implicits._

    // Read as a DataFrame, then convert it to a Dataset[String] via .as[String]
    val data_3 = spark.read.text("./src/main/data/data.txt").as[String]
    val words = data_3.flatMap(_.split(","))           // split lines into words
    val groupedWords = words.groupByKey(_.toLowerCase) // group by lowercase word
    val counts = groupedWords.count()                  // Dataset[(String, Long)]
    counts.show()

    spark.stop()
  }
}
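Because counts is a typed Dataset[(String, Long)], its rows can be collected straight into ordinary Scala collections, which is the main payoff of the Dataset API over a plain DataFrame. For example, before spark.stop():
// Collect the typed result into a Scala Map on the driver
val wordCounts: Map[String, Long] = counts.collect().toMap
println(wordCounts.getOrElse("a", 0L))  // 3 for the sample data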