示例数据(即输入文件 wordcount.txt 的内容):
hello weige
hello AJ
hello chouji
jige love AJ
weige love AJ
AJ AJ AJ I love you
/**
 * Word count implemented with SparkSession and Spark SQL.
 *
 * Reads a text file as a single-column Dataset, splits every line into words,
 * registers the words as a temporary view and counts them with a SQL query.
 */
object sqlWordCount {

  /**
   * Program entry point: runs the word count and prints each intermediate
   * Dataset as well as the final result to stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("sqlWordCount")
      .master("local[*]")
      .getOrCreate()

    try {
      // Read the input file. The resulting Dataset has a single column,
      // which is named "value" by default.
      val lines: Dataset[String] = spark.read.textFile("hdfs://hdp-1:9000/testFile/wordcount.txt")
      println("----------打印Dataset----------")
      lines.show()

      // Brings the implicit encoders required by flatMap into scope.
      import spark.implicits._

      // Split each line into words and flatten. Splitting on runs of
      // whitespace and dropping empty tokens keeps blank lines and repeated
      // spaces from being counted as an empty "word".
      val words: Dataset[String] = lines.flatMap(line => line.split("\\s+").filter(_.nonEmpty))
      println("----------打印切分压平之后的Dataset----------")
      words.show()

      // Register the words as a view. createOrReplaceTempView does not fail
      // when a view named t_wc already exists in a reused session.
      words.createOrReplaceTempView("t_wc")

      // Building the query is lazy; nothing is computed until an action runs.
      val dataFrame: DataFrame = spark.sql("select value, count(*) counts from t_wc group by value order by value desc")

      // show() is the action that triggers the computation.
      println("----------wordcount统计----------")
      dataFrame.show()
    } finally {
      // Always release the session, even if a step above throws.
      spark.stop()
    }
  }
}
运行结果:
----------打印Dataset----------
+-------------------+
| value|
+-------------------+
| hello weige|
| hello AJ|
| hello chouji|
| jige love AJ|
| weige love AJ|
|AJ AJ AJ I love you|
+-------------------+
----------打印切分压平之后的Dataset----------
+------+
| value|
+------+
| hello|
| weige|
| hello|
| AJ|
| hello|
|chouji|
| jige|
| love|
| AJ|
| weige|
| love|
| AJ|
| AJ|
| AJ|
| AJ|
| I|
| love|
| you|
+------+
----------wordcount统计----------
+------+------+
| value|counts|
+------+------+
| you| 1|
| weige| 2|
| love| 3|
| jige| 1|
| hello| 3|
|chouji| 1|
| I| 1|
| AJ| 6|
+------+------+
Process finished with exit code 0
/**
 * Word count implemented with SparkSession and the Dataset (DSL) API.
 *
 * Produces two orderings of the same counts: `r` sorted by count descending
 * only, and `r2` sorted by count descending with the word as an ascending
 * tie-breaker.
 */
object DELWordcount {

  /**
   * Program entry point: runs the word count and prints both result sets to
   * stdout.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("DELWordcount")
      .master("local[*]")
      .getOrCreate()

    try {
      val lines: Dataset[String] = spark.read.textFile("hdfs://hdp-1:9000/testFile/wordcount.txt")

      // Brings the implicit encoders and the $"col" column syntax into scope.
      import spark.implicits._

      // Split on runs of whitespace and drop empty tokens so blank lines and
      // repeated spaces are not counted as an empty "word".
      val words: Dataset[String] = lines.flatMap(line => line.split("\\s+").filter(_.nonEmpty))

      // Aggregate functions such as count (agg = aggregate).
      import org.apache.spark.sql.functions._

      // Build the grouped counts once; both orderings below derive from it.
      val counted: Dataset[Row] = words.groupBy($"value".as("word")).agg(count("*").as("counts"))

      // Method-call syntax (.desc / .asc) replaces the postfix operators,
      // which need scala.language.postfixOps and are rejected by Scala 2.13.
      val r: Dataset[Row] = counted.orderBy($"counts".desc)
      val r2: Dataset[Row] = counted.orderBy($"counts".desc, $"word".asc)

      println("----r----")
      r.show()
      println("----r2----")
      r2.show()
    } finally {
      // Always release the session, even if a step above throws.
      spark.stop()
    }
  }
}
运行结果:
----r----
+------+------+
| word|counts|
+------+------+
| AJ| 6|
| love| 3|
| hello| 3|
| weige| 2|
| you| 1|
| jige| 1|
| I| 1|
|chouji| 1|
+------+------+
----r2----
+------+------+
| word|counts|
+------+------+
| AJ| 6|
| hello| 3|
| love| 3|
| weige| 2|
| I| 1|
|chouji| 1|
| jige| 1|
| you| 1|
+------+------+
Process finished with exit code 0