Raw data: the input file word.txt

Output:
+-----+------+
| word|counts|
+-----+------+
|hello|     5|
|world|     3|
|  RDD|     2|
|  hao|     1|
| sord|     1|
|   ni|     1|
|  c++|     1|
+-----+------+
Dataset DSL version:

package os.unix

import org.apache.spark.sql.{Dataset, Row, SparkSession}

object DataSetWordCount {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val session: SparkSession = SparkSession.builder()
      .appName("WordCount")
      .master("local")
      .getOrCreate()
    // Import the implicit conversions defined on the session object
    import session.implicits._
    val lines: Dataset[String] = session.read.textFile("C:\\Users\\os\\Desktop\\test\\word.txt")
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    //words.show()
    // DSL: $"value" refers to the column named value; "word" is its alias
    // Import Spark SQL's built-in functions so count can be used inside agg
    import org.apache.spark.sql.functions._
    val result: Dataset[Row] = words.groupBy($"value" as "word")
      .agg(count("*") as "counts")
      .sort($"counts".desc)
    result.show()
    session.stop()
  }
}
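For comparison, the same counts can be produced without leaving the typed Dataset API, using groupByKey instead of the untyped groupBy/agg. This is a minimal sketch, not part of the original examples: the object name TypedWordCount is assumed, and the input path is simply reused from above.

package os.unix

import org.apache.spark.sql.{Dataset, SparkSession}

object TypedWordCount {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .appName("TypedWordCount") // assumed app name for this sketch
      .master("local")
      .getOrCreate()
    import session.implicits._
    val words: Dataset[String] = session.read
      .textFile("C:\\Users\\os\\Desktop\\test\\word.txt")
      .flatMap(_.split(" "))
    // groupByKey keeps the typed API end to end; count() returns Dataset[(String, Long)]
    val counts: Dataset[(String, Long)] = words.groupByKey(identity).count()
    // Rename the tuple columns before sorting and printing
    counts.toDF("word", "counts").orderBy($"counts".desc).show()
    session.stop()
  }
}

The trade-off: groupByKey gives compile-time types but is generally less optimizable by Catalyst than the untyped groupBy/agg used above.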
SQL version:

package os.unix

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SqlWordCount {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .appName("sqlWordCount")
      .master("local")
      .getOrCreate()
    // Specify the data to read
    //val lines: DataFrame = session.read.format("text").load("C:\\Users\\os\\Desktop\\test\\word.txt")
    val lines: Dataset[String] = session.read.textFile("C:\\Users\\os\\Desktop\\test\\word.txt")
    // Import the implicit conversions defined on the SparkSession object
    import session.implicits._
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    //words.show()
    // Rename the default column "value" to "word" for the SQL below
    val frame: DataFrame = words.withColumnRenamed("value", "word")
    // To run SQL, first register a temporary view
    frame.createTempView("v_wc")
    val result: DataFrame = session.sql(
      "select word, count(*) counts from v_wc group by word order by counts desc")
    result.show()
    session.stop()
  }
}
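For reference, the same job written against the lower-level RDD API looks as follows. This is a minimal sketch under the same input path; the object name RddWordCount is assumed, and it prints to the console instead of using show().

package os.unix

import org.apache.spark.sql.SparkSession

object RddWordCount {
  def main(args: Array[String]): Unit = {
    val session: SparkSession = SparkSession.builder()
      .appName("rddWordCount") // assumed app name for this sketch
      .master("local")
      .getOrCreate()
    // Drop down to the SparkContext for the classic RDD word count
    val result = session.sparkContext
      .textFile("C:\\Users\\os\\Desktop\\test\\word.txt")
      .flatMap(_.split(" "))
      .map((_, 1))                     // pair each word with a count of 1
      .reduceByKey(_ + _)              // sum the counts per word
      .sortBy(_._2, ascending = false) // highest counts first
    result.collect().foreach(println)
    session.stop()
  }
}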