import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
object sparkSqlWC {
  def main(args: Array[String]): Unit = {
    // Build a SparkSession. getOrCreate() behaves like a singleton: it returns
    // the already-active SparkSession if one exists, otherwise it creates one.
    val sparkSession = SparkSession.builder().appName("sparkSqlwc").master("local[2]").getOrCreate()
    // Read the file as a Dataset[String]
    val lines: Dataset[String] = sparkSession.read.textFile("D:\\数据\\person.txt")
    lines.show() // not flattened yet: each row is a whole line, in a single column named "value"
    import sparkSession.implicits._ // implicit Encoders and $"..." syntax are needed below
    val words: Dataset[String] = lines.flatMap(_.split(","))
    // As the source of count() shows, the aggregate column is automatically
    // named "count", so sort can refer to $"count" directly
    val counts: Dataset[Row] = words.groupBy($"value").count().sort($"count")
    counts.show()
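    // A quick check of the auto-generated column name (illustrative; the
    // schema here is value: string, count: long):
    counts.printSchema()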
    // (2) Use an aggregate function: agg() applies aggregations to the grouped data
    import org.apache.spark.sql.functions._
    val res: Dataset[Row] = words.groupBy($"value").agg(count("*") as "counts1").orderBy($"counts1".desc)
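    // Equivalent ordering using the desc() function from
    // org.apache.spark.sql.functions (already imported above):
    // val res = words.groupBy($"value").agg(count("*") as "counts1").orderBy(desc("counts1"))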
    //res.show()
    // (3) Use a temporary view and run SQL against it
    words.createTempView("t_wc")
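    // Note: createTempView throws an AnalysisException if the view already
    // exists; createOrReplaceTempView("t_wc") is the idempotent alternative.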
    val res0: DataFrame = sparkSession.sql("select value, count(*) as counts from t_wc group by value order by counts desc")
    // Trigger an action
    // res0.show()
    sparkSession.stop()
  }
}
DataSet version WordCount
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
object DataSetWordCount {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession
    val spark = SparkSession.builder()
      .appName("DataSetWordCount")
      .master("local[*]")
      .getOrCreate()
    // Read the data (lazy: nothing is read until an action runs).
    // A Dataset is a distributed collection, a further, smarter layer over RDD.
    // read.textFile returns a single-column Dataset; the column is named "value" by default.
    val lines: Dataset[String] = spark.read.textFile("hdfs://node-4:9000/words")
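    // Note: "hdfs://node-4:9000/words" is cluster-specific; for a local test,
    // a plain local path (e.g. a "words.txt" file) could be used instead.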
    // Transform the data: split each line and flatten
    import spark.implicits._ // implicit Encoders and $"..." syntax
    val words: Dataset[String] = lines.flatMap(_.split(" "))
    // Use the DataSet API (DSL)
    //val count: DataFrame = words.groupBy($"value" as "word").count().sort($"count".desc)
    // Import the aggregate functions
    import org.apache.spark.sql.functions._
    val counts = words.groupBy($"value" as "word").agg(count("*") as "counts").orderBy($"counts".desc)
    counts.show()
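    // The result has two columns: "word" (the renamed grouping column) and
    // "counts" (the aggregate), sorted in descending order of counts.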
    spark.stop()
  }
}
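Either job can be packaged and launched with spark-submit (a minimal sketch; the jar name is hypothetical):

spark-submit --class DataSetWordCount --master local[*] wordcount.jar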