Overview
- Spark initialization and JDBC connections
- Define the Person schema, create a DataFrame, and structure the data
- Register a temp table and run a simple SQL query
- Output the results: JSON to HDFS, plus two JDBC approaches for writing the results to MySQL
Environment: IntelliJ IDEA + Maven, Hadoop HA
Code Snippets
- Spark initialization
// HDFS has permission management enabled; set the user name to avoid read/write permission errors
System.setProperty("HADOOP_USER_NAME", "hadoop")
// Initialize Spark
val conf = new SparkConf().setAppName("SQL-SCHEMA").setMaster("local[*]")
val sc = new SparkContext(conf)
val sqlsc = new SQLContext(sc)
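Note: in Spark 2.x, SQLContext is kept mainly for backwards compatibility and SparkSession is the recommended entry point. A minimal equivalent sketch:

import org.apache.spark.sql.SparkSession

// Equivalent initialization through SparkSession (Spark 2.x entry point)
val spark = SparkSession.builder()
  .appName("SQL-SCHEMA")
  .master("local[*]")
  .getOrCreate()
val sc = spark.sparkContext // underlying SparkContext, if still needed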
- Define the Person schema, create a DataFrame, structure the data
// Read the data from HDFS
val lineRDD = sc.textFile(args(0)).map(_.split(","))
val schema = StructType(List(
  StructField("id", IntegerType, true),
  StructField("name", StringType, true),
  StructField("age", IntegerType, true)))
val rowRDD = lineRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
val personDF: DataFrame = sqlsc.createDataFrame(rowRDD, schema)
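When the schema is known at compile time, the same DataFrame can also be built by reflection from a case class instead of an explicit StructType. A minimal sketch; the Person case class is illustrative here and must be defined at the top level (outside main) for the reflection to work:

// Hypothetical case class mirroring the explicit schema above
case class Person(id: Int, name: String, age: Int)

import sqlsc.implicits._
val personDF2: DataFrame = lineRDD
  .map(x => Person(x(0).toInt, x(1), x(2).toInt))
  .toDF() // schema (id, name, age) is inferred from the case class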
- Register a temp table and run a simple SQL query
personDF.registerTempTable("t_person")
val sqlrs: DataFrame = sqlsc.sql("select * from t_person order by age desc limit 2")
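The same query can also be expressed with the DataFrame API, without registering a table; a minimal sketch:

// DataFrame API equivalent of the SQL string above
val top2: DataFrame = personDF
  .orderBy(personDF("age").desc)
  .limit(2)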
- Output the results:
- 1. Write the data to HDFS in JSON format
- 2. Write the results to MySQL via JDBC, in two different ways
For both DataFrame methods (Output - MySQL DF 1 and Output - MySQL DF 2 below), see the Scala API docs.
// Output
// Output - HDFS JSON
sqlrs.write.json(args(1))
// Output - MySQL DF 1
sqlrs.write.format("jdbc")
  .option("url", "jdbc:mysql://192.168.xx.xx:3306/sparktestresult")
  .option("dbtable", "t_person")
  .option("user", "root")
  .option("password", "123456")
  .mode(SaveMode.Append).save()
// Output - MySQL DF 2
val connectionProperties = new Properties()
connectionProperties.setProperty("user", "root")
connectionProperties.setProperty("password", "123456")
sqlrs.write.mode("append").jdbc("jdbc:mysql://192.168.xx.xx:3306/sparktestresult", "t_person", connectionProperties)
// Output - MySQL RDD 3: per-partition JDBC inserts via InsertData (defined in App below);
// note this writes the raw rowRDD, not the query result sqlrs
rowRDD.foreachPartition(InsertData)
Looking at the source of DataFrame.write.jdbc("xxx"), it turns out to be essentially the same as write.format("jdbc").option("xxx", "xxx"): the connection properties and the explicit url/dbtable are merged into extraOptions before format("jdbc").save() is called.
def jdbc(url: String, table: String, connectionProperties: Properties): Unit = {
  assertNotPartitioned("jdbc")
  assertNotBucketed("jdbc")
  // connectionProperties should override settings in extraOptions.
  this.extraOptions ++= connectionProperties.asScala
  // explicit url and dbtable should override all
  this.extraOptions += ("url" -> url, "dbtable" -> table)
  format("jdbc").save()
}
Source Code
Dependencies
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-hive_2.11</artifactId>
<version>2.3.0</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.0</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-mllib_2.11</artifactId>
<version>2.3.0</version>
<scope>runtime</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>2.7.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
<dependency>
<groupId>mysql</groupId>
<artifactId>mysql-connector-java</artifactId>
<version>5.1.38</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-streaming_2.11</artifactId>
<version>2.3.0</version>
<!-- <scope>provided</scope>-->
</dependency>
</dependencies>
App
import java.util.Properties
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import java.sql.{Connection, PreparedStatement}
object SparkSQL_Person_Schema {
  // The singleton object is initialized once per JVM, so on a cluster each
  // executor JVM opens its own connection when it first references this object.
  val connProps: Properties = new Properties()
  connProps.setProperty("user", "root")
  connProps.setProperty("password", "123456")
  val url = "jdbc:mysql://192.168.xx.xx:3306/sparktestresult"
  val conn: Connection = dbUtils.getConnection("mysql", url, connProps)
  def InsertData(iter: Iterator[Row]): Unit = {
    try {
      iter.foreach(row => {
        val valueseq: Seq[Any] = row.toSeq // the iterator yields Rows; convert each to a Seq
        val id = valueseq(0)
        val name = valueseq(1)
        val age = valueseq(2)
        val statement: PreparedStatement = conn.prepareStatement("insert into t_person(id,name,age) values(?,?,?)")
        statement.setInt(1, id.asInstanceOf[Int])
        statement.setString(2, name.asInstanceOf[String])
        statement.setInt(3, age.asInstanceOf[Int])
        statement.executeUpdate()
        statement.close()
      })
    } catch {
      case ex: Exception => // handle exception
    } finally {
    }
  }
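  // insertDataBatched is a hypothetical variant, not required by main below:
  // preparing a new statement per row (as in InsertData) works but is wasteful,
  // so this prepares one PreparedStatement per partition and batches the inserts,
  // under the same one-connection-per-JVM assumption.
  def insertDataBatched(iter: Iterator[Row]): Unit = {
    val statement = conn.prepareStatement("insert into t_person(id,name,age) values(?,?,?)")
    try {
      iter.foreach { row =>
        statement.setInt(1, row.getInt(0))
        statement.setString(2, row.getString(1))
        statement.setInt(3, row.getInt(2))
        statement.addBatch()
      }
      statement.executeBatch() // flush the buffered inserts for this partition
    } finally {
      statement.close()
    }
  }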
  def main(args: Array[String]): Unit = {
    // HDFS has permission management enabled; set the user name to avoid read/write permission errors
    System.setProperty("HADOOP_USER_NAME", "hadoop")
    // Initialize Spark
    val conf = new SparkConf().setAppName("SQL-SCHEMA").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlsc = new SQLContext(sc)
    // Read the data from HDFS
    val lineRDD = sc.textFile(args(0)).map(_.split(","))
    val schema = StructType(List(
      StructField("id", IntegerType, true),
      StructField("name", StringType, true),
      StructField("age", IntegerType, true)))
    val rowRDD = lineRDD.map(x => Row(x(0).toInt, x(1), x(2).toInt))
    val personDF: DataFrame = sqlsc.createDataFrame(rowRDD, schema)
    // Register a temp table
    personDF.registerTempTable("t_person")
    val sqlrs: DataFrame = sqlsc.sql("select * from t_person order by age desc limit 2")
    // Output
    // Output - HDFS JSON
    sqlrs.write.json(args(1))
    /*
    // Output - MySQL DF 1
    sqlrs.write.format("jdbc")
      .option("url", "jdbc:mysql://192.168.xx.xx:3306/sparktestresult")
      .option("dbtable", "t_person")
      .option("user", "root")
      .option("password", "123456")
      .mode(SaveMode.Append).save()
    // Output - MySQL DF 2
    val connectionProperties = new Properties()
    connectionProperties.setProperty("user", "root")
    connectionProperties.setProperty("password", "123456")
    sqlrs.write.mode("append").jdbc("jdbc:mysql://192.168.xx.xx:3306/sparktestresult", "t_person", connectionProperties)
    */
    // Output - MySQL RDD 3 (writes the raw rowRDD, not the query result sqlrs)
    rowRDD.foreachPartition(InsertData)
    sc.stop()
  }
}
dbUtils
import java.sql.{Connection, DriverManager}
import java.util.Properties
package object dbUtils {
  // Load the JDBC driver class. With JDBC 4.0+ drivers this explicit Class.forName
  // is usually unnecessary (DriverManager discovers drivers via ServiceLoader),
  // but it is kept here for clarity.
  def driverLoad(dbType: String) = {
    try {
      dbType match {
        case "mysql" => Class.forName("com.mysql.jdbc.Driver")
        case "db2" | "oracle" | _ => "pending" // other databases not implemented yet
      }
    } catch {
      case ex: Exception => // handle driver load exception
    }
  }
  // Get a connection
  def getConnection(dbType: String, url: String, connProps: Properties): Connection = {
    this.driverLoad(dbType)
    DriverManager.getConnection(url, connProps.getProperty("user"), connProps.getProperty("password"))
  }
  // Free the connection
  def close(conn: Connection) = {
    try {
      if (conn != null && !conn.isClosed) conn.close()
    } catch {
      case ex: Exception => // handle close exception
    }
  }
}
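For completeness, a standalone usage sketch of dbUtils; DbUtilsDemo is illustrative, and the URL and credentials are the same placeholders used above:

import java.util.Properties

object DbUtilsDemo {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.setProperty("user", "root")
    props.setProperty("password", "123456")
    val conn = dbUtils.getConnection("mysql", "jdbc:mysql://192.168.xx.xx:3306/sparktestresult", props)
    // ... use the connection ...
    dbUtils.close(conn) // release the connection when done
  }
}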
Input and Output
hadoop fs -cat hdfs://mycluster/person/data.csv
1,xiaoming,12
2,xiaoli,13
3,mayun,18
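Given this input and the query (order by age desc limit 2), the JSON written to args(1) should look like the following, one object per line (possibly split across part files):

{"id":3,"name":"mayun","age":18}
{"id":2,"name":"xiaoli","age":13}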