1. 关于SparkSql的相关用法请参考
https://blog.csdn.net/weixin_37835915/article/details/103531043
https://blog.csdn.net/weixin_37835915/article/details/103599521
2. 关于hive的相关知识和用法请参考
https://blog.csdn.net/weixin_37835915/article/details/103311326
3.SparkSql + hive 直接上代码:
package com.spark.sql.hive
import org.apache.spark.sql.SparkSession
object SparkHiveExample {
  // Example row type for a Hive-backed table (kept for source compatibility;
  // not used by main below).
  case class Record(key: Int, value: String)

  /**
   * Demonstrates SparkSQL with Hive support: creates a Hive table, loads a
   * local text file into it, runs queries, and joins it with an in-memory
   * DataFrame registered as a temp view.
   */
  def main(args: Array[String]): Unit = {
    // Directory where Hive-managed table data is stored on the local disk.
    val warehouseLocation = "C:/hnn/Project/Self/spark/spark-warehouse"
    // enableHiveSupport() is required for CREATE TABLE / LOAD DATA to work.
    val spark = SparkSession
      .builder()
      .appName("Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    import spark.sql

    sql("CREATE TABLE IF NOT EXISTS user (name STRING, age INT)")
    sql("LOAD DATA LOCAL INPATH 'in/people1.txt' INTO TABLE user")
    sql("SELECT * FROM user").show()
    sql("SELECT COUNT(*) count FROM user").show()
    // Filtered, sorted query. The original assigned this to an unused local
    // (`sqlDF`) and never displayed it; show the result instead.
    sql("SELECT name, age FROM user WHERE age > 10 ORDER BY age").show()

    // Register an in-memory DataFrame as a temp view so it can be joined
    // against the Hive table. This does NOT persist anything to the warehouse.
    val recordsDF = spark.createDataFrame(Array(("AA", 19), ("BB", 29), ("CC", 30))).toDF("name", "age")
    recordsDF.createOrReplaceTempView("user1")
    // Join the temp view with the Hive table on age.
    sql("SELECT * FROM user1 r JOIN user s ON r.age = s.age").show()

    spark.stop()
  }
}
说明如下:
1.构建SparkSession的时候必须加上hive支持 .enableHiveSupport()
2.warehouseLocation 指定hive表数据存储的路径
3.sql("CREATE TABLE IF NOT EXISTS user (name STRING, age INT)") 创建一张user表设置name和age两个字段
4.sql("LOAD DATA LOCAL INPATH 'in/people1.txt' INTO TABLE user") 从people1.txt文件中读取数据将数据保存到user表中
5.sql("SELECT * FROM user").show() 展示user表
6.运行后在此过程中可以观察spark-warehouse目录下会生成user这个文件夹,即说明创建表成功并成功写入hive
7.打开文件夹就会发现原始数据将会保存到此目录下
8.创建一个DataFrame临时表,这时候hive表可以与这个DataFrame进行关联,代码如下:
val recordsDF = spark.createDataFrame(Array(("AA",19),("BB",29),("CC",30))).toDF("name","age")
recordsDF.createOrReplaceTempView("user1")
sql("SELECT * FROM user1 r JOIN user s ON r.age = s.age").show()
注意:此操作不会将数据保存到hive中,所以warehouse目录下不会产生相应文件。运行结果如下:
补充例子:合并两张表
package com.spark.self
import org.apache.spark.sql.SparkSession
object SparkSqlHiveTem {
  /**
   * Demonstrates merging two Hive tables: loads user_basic_info and
   * user_address from local files, then joins them on name, collapsing
   * each user's addresses into a single comma-separated column.
   */
  def main(args: Array[String]): Unit = {
    // Directory where Hive-managed table data is stored on the local disk.
    val warehouseLocation = "C:/hnn/Project/Self/spark/spark-warehouse"
    val spark = SparkSession
      .builder()
      .appName("Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .master("local[*]")
      .enableHiveSupport()
      .getOrCreate()
    import spark.sql

    sql("CREATE TABLE IF NOT EXISTS user_basic_info (id INT, name STRING)")
    sql("LOAD DATA LOCAL INPATH 'in/user_basic_info.txt' INTO TABLE user_basic_info")
    sql("SELECT * FROM user_basic_info").show()

    sql("CREATE TABLE IF NOT EXISTS user_address (name STRING,address STRING)")
    sql("LOAD DATA LOCAL INPATH 'in/user_address.txt' INTO TABLE user_address")
    sql("SELECT * FROM user_address").show()

    // collect_set gathers each user's distinct addresses; concat_ws joins
    // them into one comma-separated string per group.
    sql("select max(ubi.id), ubi.name," +
      " concat_ws(',', collect_set(ua.address)) as address " +
      "from user_basic_info ubi " +
      "join user_address ua " +
      "on ubi.name=ua.name " +
      "group by ubi.name").show()

    // Release the local Spark context (was missing in the original, leaking
    // the session until JVM exit).
    spark.stop()
  }
}
运行结果如下: