前置文章:
Spark SQL External Data Source 产生背景 & 概述 & 目标 & 使用
Spark SQL整合Hive使用
先在MySQL中创建数据库、表:
-- Create the `spark` database and switch to it.
mysql> create database spark;
mysql> use spark;
-- Create the DEPT table; DEPTNO is the primary key and is the column the
-- Scala application below joins on.
mysql> CREATE TABLE DEPT(
DEPTNO int(2) PRIMARY KEY,
DNAME VARCHAR(14) ,
LOC VARCHAR(13) ) ;
-- Insert four department rows (DEPTNO 10, 20, 30 and 40).
mysql> INSERT INTO DEPT VALUES(10,'ACCOUNTING','NEW YORK');
mysql> INSERT INTO DEPT VALUES(20,'RESEARCH','DALLAS');
mysql> INSERT INTO DEPT VALUES(30,'SALES','CHICAGO');
mysql> INSERT INTO DEPT VALUES(40,'OPERATIONS','BOSTON');
代码:
/**
 * Queries a Hive table and a MySQL table together through Spark SQL
 * external data sources.
 *
 * Loads the Hive table `emp` and the MySQL table `spark.DEPT` (via JDBC),
 * joins them on the department number, prints the joined rows, and then
 * prints a projection of employee number, employee name, department number
 * and department name.
 */
object HiveMySQLApp {

  /**
   * Application entry point.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // Hive support must be enabled explicitly: without it the session uses
    // the in-memory catalog and spark.table("emp") cannot resolve tables
    // registered in the Hive metastore.
    val spark = SparkSession.builder()
      .appName("HiveMySQLApp")
      .master("local[2]")
      .enableHiveSupport()
      .getOrCreate()

    try {
      // Load the Hive table.
      val hiveDF = spark.table("emp")

      // Load the MySQL table through the JDBC data source.
      val mysqlDF = spark.read.format("jdbc")
        .option("url", "jdbc:mysql://localhost:3306")
        .option("dbtable", "spark.DEPT")
        .option("user", "root")
        .option("password", "root")
        .option("driver", "com.mysql.jdbc.Driver")
        .load()

      // Join employees to departments on the department number.
      val resultDF = hiveDF.join(mysqlDF, hiveDF.col("deptno") === mysqlDF.col("DEPTNO"))
      resultDF.show()

      // Project only the columns of interest from each side of the join.
      resultDF.select(
        hiveDF.col("empno"),
        hiveDF.col("ename"),
        mysqlDF.col("deptno"),
        mysqlDF.col("dname")
      ).show()
    } finally {
      // Always release the session, even if loading or joining fails.
      spark.stop()
    }
  }
}