I. Functionality
Configure Spark connections to both Hive and MySQL, read data from each, and store the final result in Hive.
II. Prerequisites
1. HDFS
2. Hive metastore service
3. MySQL
III. Implementation
1. Read data from Hive
// 1. Create the context
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("hive-join-mysql")
.set("spark.sql.shuffle.partitions","10")
val sc = SparkContext.getOrCreate(conf)
val sqlContext = new HiveContext(sc)
sqlContext
.read
.table("default.dept")
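To verify the read, display the returned DataFrame (this matches the sanity check that is commented out in the complete code below):
sqlContext
.read
.table("default.dept")
.show()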
2. Configure the MySQL connection
// 0. Define the connection constants
// lazy val driver="com.mysql.jdbc.Driver"
lazy val url="jdbc:mysql://bigdata.ibeifeng.com:3306/test"
lazy val user="root"
lazy val password= "123456"
lazy val props=new Properties()
props.put("user",user)
props.put("password",password)
// 1. Create the context
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("hive-join-mysql")
.set("spark.sql.shuffle.partitions","10")
val sc = SparkContext.getOrCreate(conf)
val sqlContext = new HiveContext(sc)
sqlContext
.read
.table("default.dept")
.write
.mode(SaveMode.Overwrite)
.jdbc(url, "tb_dept", props)
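If the MySQL driver class is not picked up automatically on the executors, it can be named explicitly through the same Properties object, a small sketch reusing the driver class from the commented-out constant above:
props.put("driver", "com.mysql.jdbc.Driver")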
3. Read data from MySQL
// 0. Define the connection constants
// lazy val driver="com.mysql.jdbc.Driver"
lazy val url="jdbc:mysql://bigdata.ibeifeng.com:3306/test"
lazy val user="root"
lazy val password= "123456"
lazy val props=new Properties()
props.put("user",user)
props.put("password",password)
// 1. Create the context
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("hive-join-mysql")
.set("spark.sql.shuffle.partitions","10")
val sc = SparkContext.getOrCreate(conf)
val sqlContext = new HiveContext(sc)
sqlContext
.read
.jdbc(url, "tb_dept", Array("deptno < 25", "deptno >=25 AND deptno < 28", "deptno >= 28"), props)
.registerTempTable("tmp_tb_dept")
val joinResultDF = sqlContext.sql("select a.*,b.dname,b.loc from default.emp a join tmp_tb_dept b on a.deptno = b.deptno")
joinResultDF.show()
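Each string in the Array becomes the WHERE clause of one JDBC partition, so this read issues three parallel queries. A sketch of an equivalent read using the numeric-column overload (the column bounds and partition count below are illustrative assumptions, not taken from this setup):
sqlContext
.read
.jdbc(url, "tb_dept", "deptno", 10, 40, 3, props)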
4. Write the result to Hive as Parquet
joinResultDF
.write
.format("parquet")
.mode(SaveMode.Overwrite)
.partitionBy("deptno")
.saveAsTable("default.tb_result_join_emp_dept2")
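After the write, the new table can be read back through the same context as a quick sanity check (assuming the write above succeeded):
sqlContext.table("default.tb_result_join_emp_dept2").show()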
5. Complete code
package _0729DF
import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.hive.HiveContext
/**
 * Join a Hive table with a MySQL table and store the joined result back into Hive.
 */
object HiveJoinMysql extends App {
// 0. Define the connection constants
// lazy val driver="com.mysql.jdbc.Driver"
lazy val url="jdbc:mysql://bigdata.ibeifeng.com:3306/test"
lazy val user="root"
lazy val password= "123456"
lazy val props=new Properties()
props.put("user",user)
props.put("password",password)
// 1. Create the context
val conf = new SparkConf()
.setMaster("local[*]")
.setAppName("hive-join-mysql")
.set("spark.sql.shuffle.partitions","10")
val sc = SparkContext.getOrCreate(conf)
val sqlContext = new HiveContext(sc)
// sqlContext.table("default.dept").show()
// Requirement 1: sync the Hive table into MySQL
// Writing out through write.jdbc has a drawback, so it is generally not used for
// exporting to a relational database: it cannot do Insert-Or-Update on existing
// rows (i.e. update matching records and insert new ones).
// TODO: consider a better way to write into an RDBMS ==> write out from the
// partitions directly, as in the upsert sketch below
// sqlContext
// .read
// .table("default.dept")
// .write
// .mode(SaveMode.Overwrite)
// .jdbc(url, "tb_dept", props)
/**
*
* mysql> select * from tb_dept;
+--------+------------+----------+
| deptno | dname | loc |
+--------+------------+----------+
| 10 | ACCOUNTING | NEW YORK |
| 20 | RESEARCH | DALLAS |
| 30 | SALES | CHICAGO |
| 40 | OPERATIONS | BOSTON |
+--------+------------+----------+
*
*
*/
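// A minimal Insert-Or-Update sketch for the TODO above, writing from the partitions
// directly over JDBC. Assumptions: tb_dept has PRIMARY KEY(deptno), so MySQL's
// INSERT ... ON DUPLICATE KEY UPDATE performs the upsert; java.sql.DriverManager
// must be imported in addition to the imports above.
// sqlContext.table("default.dept").foreachPartition { iter =>
//   val conn = DriverManager.getConnection(url, user, password)
//   val stmt = conn.prepareStatement(
//     "INSERT INTO tb_dept(deptno, dname, loc) VALUES (?, ?, ?) " +
//       "ON DUPLICATE KEY UPDATE dname = VALUES(dname), loc = VALUES(loc)")
//   try {
//     iter.foreach { row =>
//       stmt.setInt(1, row.getAs[Int]("deptno"))
//       stmt.setString(2, row.getAs[String]("dname"))
//       stmt.setString(3, row.getAs[String]("loc"))
//       stmt.addBatch()
//     }
//     stmt.executeBatch()
//   } finally {
//     stmt.close(); conn.close()
//   }
// }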
//------------------------------------------------------------------------------
// Requirement 2: join the Hive table with the MySQL table [2018-07-30 update]
/**
 * Steps:
 * 1. Read the MySQL table into a DataFrame
 * 2. Register the DataFrame as a temporary table (tmp_tb_dept)
 *    Note: a temporary table name must not contain the "." character
 * 3. Join tmp_tb_dept with default.emp
 */
sqlContext
.read
.jdbc(url, "tb_dept", Array("deptno < 25", "deptno >=25 AND deptno < 28", "deptno >= 28"), props)
.registerTempTable("tmp_tb_dept")
val joinResultDF = sqlContext.sql("select a.*,b.dname,b.loc from default.emp a join tmp_tb_dept b on a.deptno = b.deptno")
joinResultDF.show()
/**
* Result:
* +-----+------+---------+----+----------+------+------+------+----------+--------+
|empno| ename| job| mgr| hiredate| sal| comm|deptno| dname| loc|
+-----+------+---------+----+----------+------+------+------+----------+--------+
| 7934|MILLER| CLERK|7782| 1982-1-23|1300.0| null| 10|ACCOUNTING|NEW YORK|
| 7839| KING|PRESIDENT|null|1981-11-17|5000.0| null| 10|ACCOUNTING|NEW YORK|
| 7782| CLARK| MANAGER|7839| 1981-6-9|2450.0| null| 10|ACCOUNTING|NEW YORK|
| 7902| FORD| ANALYST|7566| 1981-12-3|3000.0| null| 20| RESEARCH| DALLAS|
| 7876| ADAMS| CLERK|7788| 1987-5-23|1100.0| null| 20| RESEARCH| DALLAS|
| 7788| SCOTT| ANALYST|7566| 1987-4-19|3000.0| null| 20| RESEARCH| DALLAS|
| 7566| JONES| MANAGER|7839| 1981-4-2|2975.0| null| 20| RESEARCH| DALLAS|
| 7369| SMITH| CLERK|7902|1980-12-17| 800.0| null| 20| RESEARCH| DALLAS|
| 7900| JAMES| CLERK|7698| 1981-12-3| 950.0| null| 30| SALES| CHICAGO|
| 7844|TURNER| SALESMAN|7698| 1981-9-8|1500.0| 0.0| 30| SALES| CHICAGO|
| 7698| BLAKE| MANAGER|7839| 1981-5-1|2850.0| null| 30| SALES| CHICAGO|
| 7654|MARTIN| SALESMAN|7698| 1981-9-28|1250.0|1400.0| 30| SALES| CHICAGO|
| 7521| WARD| SALESMAN|7698| 1981-2-22|1250.0| 500.0| 30| SALES| CHICAGO|
| 7499| ALLEN| SALESMAN|7698| 1981-2-20|1600.0| 300.0| 30| SALES| CHICAGO|
+-----+------+---------+----+----------+------+------+------+----------+--------+
*/
//------------------------------------------------------------------------------
// Requirement 3: [storage] save the data to HDFS in Parquet format
joinResultDF
.write
.format("parquet")
.mode(SaveMode.Overwrite)
.partitionBy("deptno")
.save("result/sql/parquet/01") // 指定的是hdfs上的文件,如果没有给定fs.defaultFS,默认是保存到本地的
/**
 * Result:
 * the output appears under E:\Tools\WorkspaceforMyeclipse\scalaProjectMaven\result\sql\parquet
 */
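// To force the output onto HDFS regardless of fs.defaultFS, a fully qualified URI
// can be passed instead (the namenode host and port below are placeholders, not
// taken from this setup):
// joinResultDF.write.format("parquet").mode(SaveMode.Overwrite)
//   .partitionBy("deptno").save("hdfs://namenode:8020/result/sql/parquet/01")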
// [Store into Hive: 2018-07-30 update]
// Store into Hive ==> write the data into a Hive table in Parquet format
joinResultDF
.write
.format("parquet")
.mode(SaveMode.Overwrite)
.partitionBy("deptno")
.saveAsTable("default.tb_result_join_emp_dept2")
/**
* Hive result:
* hive (default)> show tables;
OK
tab_name
casttest
casttest2
dept
emp
emp_part
emp_part2
jihetable
student
tb_result_join_emp_dept
u_data
u_data_new
yhd_source
Time taken: 0.166 seconds, Fetched: 12 row(s)
*/
//------------------------------------------------------------------------------
}