Advanced Spark SQL
JDBC Connections
Use JDBC to access a relational database and load its data into Spark for analysis and processing.
Start spark-shell (with the MySQL driver loaded):
spark-shell --master spark://hadoop1:7077 --jars /root/temp/mysql-connector-java-8.0.13.jar --driver-class-path /root/temp/mysql-connector-java-8.0.13.jar
1. Connecting to MySQL on Windows with option()
val mysql = spark.read.format("jdbc").option("url","jdbc:mysql://192.168.138.1:3306/data?serverTimezone=GMT%2B8").option("user","destiny").option("password","destiny").option("dbtable","log").load
mysql.show
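Once loaded, the DataFrame can also be queried with SQL by registering it as a temporary view — a minimal sketch using the log DataFrame from above:
mysql.createOrReplaceTempView("log")
spark.sql("select * from log limit 10").show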
2. Connecting to MySQL on Windows with a Properties object
import java.util.Properties
val property = new Properties()
property.setProperty("user","destiny")
property.setProperty("password","destiny")
val mysql = spark.read.jdbc("jdbc:mysql://192.168.138.1:3306/data?serverTimezone=GMT%2B8","log",property)
mysql.show
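For a large table, the JDBC read can be split across parallel partitions by giving Spark a numeric partition column and its bounds. A minimal sketch, assuming (hypothetically) that log has a numeric id column — the column name and bounds below are illustrative, not from the original:
// Split the read into 4 partitions on the numeric id column (hypothetical column and bounds)
val mysqlPartitioned = spark.read.jdbc(
  "jdbc:mysql://192.168.138.1:3306/data?serverTimezone=GMT%2B8",
  "log",
  "id",     // partition column (assumed to exist)
  1L,       // lower bound
  100000L,  // upper bound (rows outside the range are still read)
  4,        // number of partitions
  property)
mysqlPartitioned.show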
Using Hive
(1) Copy hdfs-site.xml, core-site.xml, and hive-site.xml into Spark's conf directory
(2) Start Zookeeper and Hadoop
zkServer.sh start
start-all.sh
(3) Start the Hive metastore service on the Master
hive --service metastore
(4) Connect a Hive client on the Slave
hive
# Display the name of the current database (e.g. default) in the CLI prompt
set hive.cli.print.current.db=true;
(5) Start Spark on the Slave
./sbin/start-all.sh
spark-shell --master spark://hadoop2:7077
(6) Work with the Hive database from spark-shell (create a student table and load data into it)
spark.sql("create table default.student(studentID String,studentName String)row format delimited fields terminated by '\t'")
spark.sql("load data local inpath '/root/temp/studentSheet.txt' overwrite into table default.student")
Creating a DataFrame in IDEA
1. Creating with a schema
package Spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
object DataFrame {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Create the SparkSession
    val spark = SparkSession.builder().appName("DataFrame").master("local").getOrCreate()
    // Create an RDD from the text file, splitting each line on tabs
    val sparkRDD = spark.sparkContext.textFile("F:\\IdeaProjects\\in\\studentSheet.txt").map(_.split("\t"))
    // Convert to an RDD of Rows
    val rowRDD = sparkRDD.map(x => Row(x(0), x(1)))
    // Define the schema
    val schema = StructType(
      List(StructField("id", StringType), StructField("name", StringType))
    )
    // Create the DataFrame
    val df = spark.createDataFrame(rowRDD, schema)
    // Register a temporary view
    df.createOrReplaceTempView("student")
    // Query the student view
    spark.sql("select * from student").show()
    // Shut down
    spark.stop()
  }
}
2. Creating with a case class
package Spark
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SparkSession
case class stu(id: String, name: String)
object DataFrame {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Create the SparkSession
    val spark = SparkSession.builder().appName("DataFrame").master("local").getOrCreate()
    // Create an RDD from the text file, splitting each line on tabs
    val sparkRDD = spark.sparkContext.textFile("F:\\IdeaProjects\\in\\studentSheet.txt").map(x => x.split("\t"))
    // Map each line onto the stu case class
    val studentRDD = sparkRDD.map(x => stu(x(0), x(1)))
    // Create the DataFrame via the implicit conversions
    import spark.sqlContext.implicits._
    val df = studentRDD.toDF
    // Register a temporary view
    df.createOrReplaceTempView("student")
    // Query the student view
    spark.sql("select * from student").show()
    // Shut down
    spark.stop()
  }
}
Result
Writing Spark results to MySQL from IDEA
package Spark
import java.util.Properties
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types.{StringType, StructField, StructType}
object SparkMySQL {
  def main(args: Array[String]): Unit = {
    Logger.getLogger("org.apache.spark").setLevel(Level.ERROR)
    // Create the SparkSession
    val spark = SparkSession.builder().appName("SparkMySQL").master("local").getOrCreate()
    // Create an RDD from the text file, splitting each line on tabs
    val sparkRDD = spark.sparkContext.textFile("F:\\IdeaProjects\\in\\studentSheet.txt").map(_.split("\t"))
    // Convert to an RDD of Rows
    val rowRDD = sparkRDD.map(x => Row(x(0), x(1)))
    // Define the schema
    val schema = StructType(
      List(StructField("id", StringType), StructField("name", StringType))
    )
    // Create the DataFrame
    val df = spark.createDataFrame(rowRDD, schema)
    // Register a temporary view
    df.createOrReplaceTempView("student")
    // Query the student view
    val result = spark.sql("select * from student")
    result.show()
    // Write the result into MySQL
    val property = new Properties()
    property.setProperty("user", "root")
    property.setProperty("password", "root")
    property.setProperty("driver", "com.mysql.cj.jdbc.Driver")
    result.write.mode("overwrite").jdbc("jdbc:mysql://localhost:3306/data?serverTimezone=GMT%2B8", "student", property)
    // Shut down
    spark.stop()
  }
}
Result
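To verify the write, the student table can be read back over the same JDBC connection — a short sketch reusing the property object defined in the program above:
// Read the table back from MySQL and print it
val check = spark.read.jdbc("jdbc:mysql://localhost:3306/data?serverTimezone=GMT%2B8", "student", property)
check.show()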
Using Spark in IDEA to query Hive and save the result to MySQL
Scala code:
package Spark
import java.util.Properties
import org.apache.spark.sql.SparkSession
object SparkHive {
  def main(args: Array[String]): Unit = {
    // Create the SparkSession with Hive support enabled
    val spark = SparkSession.builder().appName("SparkHive").enableHiveSupport().getOrCreate()
    // Run the Spark SQL query against Hive
    val result = spark.sql("select id,name from default.test")
    // Configure the JDBC connection properties
    val property = new Properties()
    property.setProperty("user", "destiny")
    property.setProperty("password", "destiny")
    // Append the result to the stu table in MySQL
    result.write.mode("append").jdbc("jdbc:mysql://192.168.138.1:3306/data?serverTimezone=GMT%2B8", "stu", property)
    // Shut down
    spark.stop()
  }
}
File->Project Structure->Artifacts和Build->Build Artifacts->Rebuild打成jar包,上传到Linux服务器
Spark command:
spark-submit --master spark://hadoop2:7077 --jars /root/temp/mysql-connector-java-8.0.13.jar --driver-class-path /root/temp/mysql-connector-java-8.0.13.jar --class Spark.SparkHive /root/temp/SparkSQL.jar
Result