Spark-SQL模块
文章目录
需求
基于Spark SQL完成需求开发
- 婚姻状况对合同签订的影响
- 联系方式对合同签订的影响
- 季节变换合同签订的影响
创建数据库
create table `mari_con_stat`(
`marital` varchar(20) default null,
`num` int(11) default null
);
create table `contact_con_stat`(
`contact` varchar(20) default null,
`num` int(11) default null
);
create table `month_con_stat`(
`month` varchar(20) default null,
`num` int(11) default null
);
IDEA中编码开发
sparksql.MaritalContractStatSql
package sparksql
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* @Classname MaritalContractStatSql
* @Description 年龄对营销的分析
* @Date 2022/10/7 16:42
* @Created by Tiger_Li
*/
/**
* 年龄对营销的影响
*/
object MaritalContractStatSql {
def main(args:Array[String]):Unit = {
if(args.length != 2){
println("Usage: spark-submit * *.jar masterurl SaveMode[overwrite|append|ignore]")
System.exit(1)
}
// 1. 创建SparkConf并配置master和appname
val MasterUrl = args(0)
val mySaveMode = args(1)
val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}").setMaster(MasterUrl)
// 2. 创建SparkSession并配置Hive支持
val spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()
// 3. 读取hive数据源
val dataSource = spark.sql("select marital, poutcome from dwd.dwd_t_bank where marital !='' and marital is not null")
// 4. 注册为临时表
dataSource.createOrReplaceTempView("t_bank_temp")
// 5. SQL处理
val resdf: DataFrame = spark.sql("select marital,count(*) as num from t_bank_temp where poutcome = 'success' group by marital")
val url="jdbc:mysql://node1:3306/bigdata19?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false"
val table="mari_con_stat"
val prop=new java.util.Properties()
prop.put("driver","com.mysql.cj.jdbc.Driver")
prop.put("user","lh")
prop.put("password","ypassword")
resdf.write.mode(mySaveMode).jdbc(url,table,prop)
spark.stop()
}
}
sparksql.ContactContractStatSql
package sparksql
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* @Classname ContactContractStatSql
* @Description 年龄对营销的分析
* @Date 2022/10/7 16:42
* @Created by Tiger_Li
*/
/**
* 年龄对营销的影响
*/
object ContactContractStatSql {
def main(args:Array[String]):Unit = {
if(args.length != 2){
println("Usage: spark-submit * *.jar masterurl SaveMode[overwrite|append|ignore]")
System.exit(1)
}
// 1. 创建SparkConf并配置master和appname
val MasterUrl = args(0)
val mySaveMode = args(1)
val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}").setMaster(MasterUrl)
// 2. 创建SparkSession并配置Hive支持
val spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()
// 3. 读取hive数据源
val dataSource = spark.sql("select contact, poutcome from dwd.dwd_t_bank where contact !='' and contact is not null")
// 4. 注册为临时表
dataSource.createOrReplaceTempView("t_bank_temp")
// 5. SQL处理
val resdf: DataFrame = spark.sql("select contact,count(*) as num from t_bank_temp where poutcome = 'success' group by contact")
val url="jdbc:mysql://node1:3306/bigdata19?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false"
val table="contact_con_stat"
val prop=new java.util.Properties()
prop.put("driver","com.mysql.cj.jdbc.Driver")
prop.put("user","lh")
prop.put("password","ypassword")
resdf.write.mode(mySaveMode).jdbc(url,table,prop)
spark.stop()
}
}
sparksql.monthContractStatSql
package sparksql
import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, SparkSession}
/**
* @Classname monthContractStatSql
* @Description 年龄对营销的分析
* @Date 2022/10/7 16:42
* @Created by Tiger_Li
*/
/**
* 年龄对营销的影响
*/
object monthContractStatSql {
def main(args:Array[String]):Unit = {
if(args.length != 2){
println("Usage: spark-submit * *.jar masterurl SaveMode[overwrite|append|ignore]")
System.exit(1)
}
// 1. 创建SparkConf并配置master和appname
val MasterUrl = args(0)
val mySaveMode = args(1)
val conf = new SparkConf().setAppName(s"${this.getClass.getSimpleName}").setMaster(MasterUrl)
// 2. 创建SparkSession并配置Hive支持
val spark = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()
// 3. 读取hive数据源
val dataSource = spark.sql("select month_of_year as month, poutcome from dwd.dwd_t_bank where month_of_year !='' and month_of_year is not null")
dataSource.show(5)
dataSource.printSchema()
// 4. 注册为临时表
dataSource.createOrReplaceTempView("t_bank_temp")
// 5. SQL处理
val resdf: DataFrame = spark.sql("select month,count(*) as num from t_bank_temp where poutcome = 'success' group by month")
resdf.show(5,false)
resdf.printSchema()
val url="jdbc:mysql://node1:3306/bigdata19?useUnicode=true&characterEncoding=UTF-8&serverTimezone=UTC&useSSL=false"
val table="month_con_stat"
val prop=new java.util.Properties()
prop.put("driver","com.mysql.cj.jdbc.Driver")
prop.put("user","lh")
prop.put("password","ypassword")
resdf.write.mode(mySaveMode).jdbc(url,table,prop)
spark.stop()
}
}
完成代码编写后通过maven打包架包
部署运行
启动hadoop,启动hive,启动spark
# 启动hadoop
/opt/soft_installed/hadoop-2.7.3/sbin/start-dfs.sh
/opt/soft_installed/hadoop-2.7.3/sbin/start-yarn.sh
/opt/soft_installed/hadoop-2.7.3/sbin/mr-jobhistory-daemon.sh start historyserver
# 启动hive
nohup hive --service metastore >$HIVE_LOG_DIR/metastore.log 2>&1 &
# 启动spark
/opt/soft_installed/spark-2.4.8-bin-hadoop2.7/sbin/start-all.sh
/opt/soft_installed/spark-2.4.8-bin-hadoop2.7/sbin/start-history-server.sh
创建shell脚本运行spark SQL版本的营销分析
# 婚姻状况对营销的影响
# sparksql.MaritalContractStatSql
/opt/soft_installed/spark-2.4.8-bin-hadoop2.7/bin/spark-submit \
--name MCSSQL \
--class sparksql.MaritalContractStatSql \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--driver-cores 1 \
--executor-memory 512m \
--num-executors 1 \
--executor-cores 1 \
--driver-class-path /home/lh/softs/mysql-connector-java-8.0.30.jar \
/home/lh/hadooptest/jars/GUNbank-V2-jar-with-dependencies.jar \
spark://node1:7077 overwrite
# 练习方式对营销的影响
# sparksql.ContactContractStatSql
/opt/soft_installed/spark-2.4.8-bin-hadoop2.7/bin/spark-submit \
--name CCSSQL \
--class sparksql.ContactContractStatSql \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--driver-cores 1 \
--executor-memory 512m \
--num-executors 1 \
--executor-cores 1 \
--driver-class-path /home/lh/softs/mysql-connector-java-8.0.30.jar \
/home/lh/hadooptest/jars/GUNbank-V2-jar-with-dependencies.jar \
spark://node1:7077 overwrite
# 月份对营销的影响
# sparksql.monthContractStatSql
/opt/soft_installed/spark-2.4.8-bin-hadoop2.7/bin/spark-submit \
--name MonthCSSQL \
--class sparksql.monthContractStatSql \
--master yarn \
--deploy-mode cluster \
--driver-memory 512m \
--driver-cores 1 \
--executor-memory 512m \
--num-executors 1 \
--executor-cores 1 \
--driver-class-path /home/lh/softs/mysql-connector-java-8.0.30.jar \
/home/lh/hadooptest/jars/GUNbank-V2-jar-with-dependencies.jar \
spark://node1:7077 overwrite
结果检查
Spark history
Job
stage
executors
数据库
连接数据库检查