需求
1、写sparkSQL语句
package com.zch.spark.sql.exercise
import org.apache.spark.sql.SparkSession
/**
 * Author: zhaoHui
 * Date: 2021/12/24
 * Time: 15:39
 * Description: 1. Create Hive tables via SparkSQL.
 *              2. Query data by joining the Hive tables via SparkSQL.
 *              3. Save the query results into new tables.
 */
object SpakSQL_Hive {

  /**
   * Entry point. Expects exactly three HDFS paths:
   * the data files for tbDate, tbStockDetail and tbStock respectively.
   * Creates the Hive database/tables, loads the files, then materializes
   * two result tables (case01, case02).
   */
  def main(args: Array[String]): Unit = {
    // Validate CLI arguments: exactly three HDFS paths are required.
    if (args == null || args.length != 3) {
      println(
        """
          |Parameter error! Usage : <dataPath> <detailPath> <stockPath>
          |""".stripMargin)
      System.exit(-1)
    }
    // Bind the three arguments to named path variables.
    val Array(dataPath, detailPath, stockPath) = args

    // Build a Hive-enabled SparkSession.
    // NOTE: no hard-coded .master(...) here — the master must come from
    // spark-submit (e.g. --master yarn, see the submit command in the notes).
    // A hard-coded "local[2]" would silently override --master and break
    // the yarn deployment described below.
    val spark = SparkSession.builder()
      .appName("sparkHive")
      .enableHiveSupport()
      .getOrCreate()

    // Create the database if it does not exist yet.
    spark.sql(
      """
        |CREATE DATABASE IF NOT EXISTS `tb_transaction`
        |""".stripMargin)

    // Switch to the database so unqualified table names resolve to it.
    spark.sql(
      """
        |USE `tb_transaction`
        |""".stripMargin)

    // Create table tbDate (calendar dimension, CSV-delimited).
    spark.sql(
      """
        | CREATE TABLE IF NOT EXISTS `tb_transaction`.`tbDate`(
        | dateid date,
        | years string,
        | theyear string,
        | month string,
        | day string,
        | weekday string,
        | week string,
        | quarter string,
        | period string,
        | halfmonth string
        | ) ROW FORMAT DELIMITED
        | FIELDS TERMINATED BY ','
        |
        |""".stripMargin)

    // Create table tbStockDetail (order line items, CSV-delimited).
    spark.sql(
      """
        |CREATE TABLE IF NOT EXISTS `tb_transaction`.`tbStockDetail`(
        |ordernumber string,
        |rownum string,
        |itemid string,
        |number string,
        |price string,
        |amount bigint
        |)
        |ROW FORMAT DELIMITED
        | FIELDS TERMINATED BY ','
        |""".stripMargin)

    // Create table tbStock (order header: order -> location/date, CSV-delimited).
    spark.sql(
      """
        |CREATE TABLE IF NOT EXISTS `tb_transaction`.`tbStock`(
        |ordernumber string,
        |locationid string,
        |dateid date
        |)
        |ROW FORMAT DELIMITED
        | FIELDS TERMINATED BY ','
        |""".stripMargin)

    // Load the three HDFS files into their tables.
    spark.sql(
      s"""
         |LOAD DATA INPATH '${dataPath}' INTO TABLE `tb_transaction`.`tbDate`
         |""".stripMargin)
    spark.sql(
      s"""
         |LOAD DATA INPATH '${detailPath}' INTO TABLE `tb_transaction`.`tbStockDetail`
         |""".stripMargin)
    spark.sql(
      s"""
         |LOAD DATA INPATH '${stockPath}' INTO TABLE `tb_transaction`.`tbStock`
         |""".stripMargin)

    // Requirement 1: per year, count distinct orders and total sales amount.
    spark.sql(
      """
        |create table `tb_transaction`.`case01` AS
        |SELECT c.theyear theyear, COUNT(DISTINCT a.ordernumber) ordernumber, SUM(b.amount) amount
        |FROM tbStock a
        |JOIN tbStockDetail b ON a.ordernumber = b.ordernumber
        |JOIN tbDate c ON a.dateid = c.dateid
        |GROUP BY c.theyear
        |ORDER BY c.theyear
        |""".stripMargin)

    // Requirement 2: per year, the sales amount of the single largest order.
    // The inner query groups by (ordernumber, dateid) to get each ORDER's
    // total amount — grouping by dateid alone would yield per-day totals,
    // which is not what "largest order" asks for. No trailing semicolon:
    // spark.sql() rejects it.
    spark.sql(
      """
        |create table `tb_transaction`.`case02` AS
        |select td.theyear, max(c.amount_all) amount_max
        |from (
        |  select ts.dateid dateid, sum(tsd.amount) amount_all
        |  from `tbStockDetail` tsd
        |  left join tbstock ts
        |  on tsd.ordernumber = ts.ordernumber
        |  group by tsd.ordernumber, ts.dateid
        |  ) c
        |  left join tbDate td
        |  on td.dateid = c.dateid
        |where td.theyear is not null
        |group by td.theyear
        |order by td.theyear desc
        |""".stripMargin)

    // Release resources.
    spark.stop()
  }
}
2、打包到集群,注意事项:
2.1 需要将 hive-site.xml core-site.xml hdfs-site.xml 加载到工程的resources目录下
3、配置打包依赖,使用maven自带的打包工具打包
<dependencies>
    <dependency>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <version>2.5.5</version>
    </dependency>
    <dependency>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <version>2.15.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-eclipse-plugin</artifactId>
        <version>2.10</version>
    </dependency>
</dependencies>
<!--导入maven插件-->
<build>
<sourceDirectory>src/main/scala</sourceDirectory>
<plugins>
<!-- 对第三方依赖进行打包-->
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<configuration>
<archive>
<manifest>
<mainClass>com.zch.spark.sql.exercise.SpakSQL_Hive</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
4、上传至集群,开启hdfs,yarn
5、提交 spark-submit 任务
注意提前将需要载入到表中的数据上传到hdfs上
# Spark模式 yarn运行模式 运行主类
spark-submit --master yarn --deploy-mode client --class com.zch.spark.sql.exercise.SpakSQL_Hive
# jar包路径 加载文件的文件路径(hdfs路径)
/root/spark-1.0-SNAPSHOT-jar-with-dependencies.jar /tbDate.txt /tbStockDetail.txt /tbStock.txt