package com.ws.sparksql

import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}

object SqlWordCount {

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SqlWordCount").master("local[*]").getOrCreate()

    // textFile returns a Dataset with a single column, named "value" by default.
    // A Dataset is a distributed collection: a typed, optimized abstraction built on top of the RDD.
    val dataRdd: Dataset[String] = spark.read.textFile("hdfs://hadoop-01:9000/project")

    // Import implicit conversions (encoders and the $"" column syntax)
    import spark.implicits._

    // Split each line into words
    val splitRdd: Dataset[String] = dataRdd.flatMap(_.split(" "))

    // Register a temporary view so it can be queried with SQL
    splitRdd.createTempView("t_project")

    // DataFrame API alternative (aggregate functions such as agg() also work):
    // val result = splitRdd.groupBy($"value" as "keyWord").count().sort($"count" desc)

    // SQL approach
    val result: DataFrame = spark.sql("select value as keyWord, count(*) as num " +
      "from t_project group by keyWord order by num desc")

    result.show()
    spark.stop()
  }
}
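For comparison, here is a minimal sketch of the DataFrame/Dataset API route hinted at in the commented-out line above, using agg() instead of plain count(). It assumes the same spark session, the splitRdd Dataset, and that import spark.implicits._ is already in scope, as in the listing:

import org.apache.spark.sql.functions.{count, desc}

// Equivalent aggregation without SQL: group the single "value" column,
// count occurrences per word, and sort by the count in descending order.
val dfResult = splitRdd
  .groupBy($"value".as("keyWord"))
  .agg(count("*").as("num"))
  .orderBy(desc("num"))

dfResult.show()

Both versions compile down to the same logical plan through the Catalyst optimizer, so the choice between SQL and the DataFrame API is mostly a matter of style.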
Result:
+-------+---+
|keyWord|num|
+-------+---+
| spark| 7|
| hadoop| 5|
| hive| 4|
| hbase| 3|
| flume| 2|
| sqoop| 1|
| ssqoop| 1|
+-------+---+