Required dependencies
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.4.4</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.4.4</version>
</dependency>
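For projects built with sbt instead of Maven, a rough equivalent of the same two dependencies is sketched below (the %% operator appends the Scala suffix, here _2.11, automatically; the scalaVersion shown is an assumption matching that suffix).

scalaVersion := "2.11.12"
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.4",
  "org.apache.spark" %% "spark-sql"  % "2.4.4"
)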
Query each employee's information together with the highest salary in their department
import org.apache.spark.sql.SparkSession

object SparkSQLExample {
  def main(args: Array[String]): Unit = {
    //1. Create the Spark SQL runtime environment (SparkSession)
    val spark = SparkSession.builder().appName("quick example").master("local[*]").getOrCreate()
    //2. Build the data source (Dataset / DataFrame)
    import spark.implicits._
    spark
      .sparkContext.makeRDD(List(
        (1, "zs", 18, 1500, "A", true),
        (2, "ls", 20, 3000, "B", false),
        (3, "ww", 20, 2000, "B", false),
        (4, "zl", 30, 3500, "A", true),
        (3, "tq", 40, 2500, "A", false)
      ))
      // name the columns
      .toDF("id", "name", "age", "salary", "dept_id", "sex")
      .createOrReplaceTempView("t_user")
    //3. Window function query: attach the per-department maximum salary to every row
    spark
      .sql("select id,name,age,salary,dept_id,sex, max(salary) over(partition by dept_id) as dept_max_salary from t_user")
      .show()
    //4. Release resources
    spark.stop()
  }
}
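The same department maximum can also be expressed without SQL, through the DataFrame window API. The sketch below only illustrates that alternative; it assumes the spark session and the t_user temp view created in the example above.

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.max

// Equivalent DataFrame-API formulation (sketch): partition rows by dept_id
// and attach the per-department maximum salary to every row.
val byDept = Window.partitionBy("dept_id")
spark.table("t_user")
  .withColumn("dept_max_salary", max("salary").over(byDept))
  .show()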
Top 10 most-visited pages per user on a given day
import org.apache.spark.sql.SparkSession

object SparkSQLExample2 {
  def main(args: Array[String]): Unit = {
    //1. Create the Spark SQL runtime environment (SparkSession)
    val spark = SparkSession.builder().appName("quick example").master("local[*]").getOrCreate()
    //2. Build the data source (Dataset / DataFrame)
    // import the implicit conversions (needed for toDF)
    import spark.implicits._
    spark
      .sparkContext
      .makeRDD(
        List(
          ("2018-01-01", 1, "www.baidu.com", "10:01"),
          ("2018-01-01", 2, "www.baidu.com", "10:01"),
          ("2018-01-01", 1, "www.sina.com", "10:01"),
          ("2018-01-01", 3, "www.baidu.com", "10:01"),
          ("2018-01-01", 3, "www.baidu.com", "10:01"),
          ("2018-01-01", 1, "www.sina.com", "10:01")
        )
      )
      .toDF("day", "user_id", "page_id", "time")
      .createOrReplaceTempView("t_page")
    //3. Window function query: count visits per page, rank pages within each user, keep the top 10
    spark
      .sql("select * from (select user_id,page_id,num,rank() over(partition by user_id order by num desc) as rank from (select user_id,page_id,count(page_id) as num from t_page group by user_id,page_id) t1) t2 where rank <= 10")
      .show()
    //4. Release resources
    spark.stop()
  }
}
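For comparison, the same top-10-per-user logic written with the DataFrame window API might look like the following sketch; it assumes the spark session and the t_page view from the example above.

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, count, desc, rank}

// Sketch: count visits per (user_id, page_id), rank pages within each user
// by that count (descending), and keep only ranks 1..10.
val visitCounts = spark.table("t_page")
  .groupBy("user_id", "page_id")
  .agg(count("page_id").as("num"))
val byUser = Window.partitionBy("user_id").orderBy(desc("num"))
visitCounts
  .withColumn("rank", rank().over(byUser))
  .where(col("rank") <= 10)
  .show()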
Continuous time intervals
import org.apache.spark.sql.SparkSession

object SparkSQLExample3 {
  def main(args: Array[String]): Unit = {
    //1. Create the Spark SQL runtime environment (SparkSession)
    val spark = SparkSession.builder().appName("quick example").master("local[*]").getOrCreate()
    //2. Build the data source (Dataset / DataFrame)
    import spark.implicits._
    val df = spark.sparkContext.textFile("D:\\workspace\\hadoop\\Spark-review\\src\\main\\resources").toDF
    df
      .flatMap(row => {
        val line = row.getString(0)
        // take the 2nd and 3rd whitespace-separated fields of each line
        val arr = line.split("\\s")
        Array(arr(1), arr(2))
      })
      .createOrReplaceTempView("t_time")
    //3. Window function query: pair every timestamp with the one right after it
    spark
      .sql("select value,max(value) over(order by value asc rows between current row and 1 following) as next_time from (select * from t_time order by value asc) t")
      .show()
    //4. Release resources
    spark.stop()
  }
}
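An alternative to max() over a two-row frame is the lead() window function, which returns the value of the following row directly. A minimal sketch, assuming the spark session and the t_time view registered above:

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.lead

// Sketch: lead(value, 1) pairs each timestamp with the next one in ascending order.
spark.table("t_time")
  .withColumn("next_time", lead("value", 1).over(Window.orderBy("value")))
  .show()

Note that ordering a window without a partition pulls all rows into a single partition; the SQL version above has the same behavior, so this is acceptable only for small timestamp lists.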