The overall idea is to read the data first, then process it, and finally save the result into MySQL.
package DataToMysql
import java.sql.{Connection, DriverManager, PreparedStatement}
import java.text.SimpleDateFormat
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{DataFrame, Row, SaveMode, SparkSession}
import scala.collection.mutable.ListBuffer
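// One record per source column: the column name, a type label, the value range, and a creation timestamp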
case class Range(colName:String,colType:String,range:String,createTime:String)
object GetRangeToMysql {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("kkk").setMaster("local[*]")
    val spark = SparkSession.builder().config(conf).getOrCreate()
    // Read the JSON file on HDFS with Spark
    val frame = spark.read.json("hdfs://192.168.216.201:9000/output/aaa")
    // Database connection parameters
    val url = "jdbc:mysql://localhost:3306/selftest"
    val table = "t1"
    val user = "root"
    val pwd = "123456"
    // Get the column names in frame and their corresponding data types
    val dtypes = frame.dtypes
    // For each column: if its type is numeric or date, compute the min and max and join them into a range string; if it is a string column, use "-"
    val unionArray: Array[(String, String, String)] = dtypes.map(t => {
      val col = t._1
      if (t._2.equals("IntegerType") || t._2.equals("LongType") || t._2.equals("DoubleType")) {
        val arr = getMaximum(frame, col)
        (col, "numeric", arr(1) + "-" + arr(0))
      } else if (t._2.equals("DateType")) {
        val arr = getMaximum(frame, col)
        (col, "date", arr(1) + "-" + arr(0))
      } else {
        (t._1, "string", "-")
      }
    })
    // Convert the array into an RDD
    val unionRDD: RDD[(String, String, String)] = spark.sparkContext.parallelize(unionArray)
    val columns = Array("col", "num", "numRange")
    // Create a DataFrame from the RDD using the column names
    val unionFrame = spark.createDataFrame(unionRDD).toDF(columns: _*)
    // Add one more field to the frame's schema
    val schema: StructType = unionFrame.schema.add(StructField("createTime", StringType))
    val sdt = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
    val date = sdt.format(System.currentTimeMillis())
    // Merge the new column's value into the frame's underlying RDD
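    // Note: Row.merge is deprecated in Spark 3.x; it still works here but may warn on newer versions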
    val mergeRDD = unionFrame.rdd.map(data => Row.merge(data, Row(date)))
    // Build the new frame from the merged RDD and the extended schema
    val mergeFrame = spark.createDataFrame(mergeRDD, schema)
    // Per partition, wrap each row in the Range case class, collect the rows into a ListBuffer, then call the custom insert method to write them to MySQL
    mergeFrame.foreachPartition((data: Iterator[Row]) => {
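      // Buffer this partition's rows so they can be written over a single JDBC connection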
      val list = new ListBuffer[Range]
      data.foreach(info => {
        val colName = info.getAs[String]("col")
        val colType = info.getAs[String]("num")
        val range = info.getAs[String]("numRange")
        val createTime = info.getAs[String]("createTime")
        list.append(Range(colName, colType, range, createTime))
      })
      insertIntoMysql(url, user, pwd, table, list)
    })
    spark.close()
  }
  // Method to get the maximum and minimum values of a column
  def getMaximum(frame: DataFrame, colName: String): Array[String] = {
    import org.apache.spark.sql.functions._
    // Compute max and min in a single pass and return them as strings: Array(max, min)
    val row = frame.select(max(colName), min(colName)).first()
    Array(String.valueOf(row.get(0)), String.valueOf(row.get(1)))
  }
  // Method to obtain a database connection
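  // Assumes the MySQL JDBC driver (e.g. mysql-connector-java) is on the classpath so DriverManager can find it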
  def getConnection(url: String, userName: String, pwd: String) = {
    DriverManager.getConnection(url, userName, pwd)
  }
  // Method to close the database connection
  def release(conn: Connection, psmt: PreparedStatement) = {
    try {
      if (psmt != null) {
        psmt.close()
      }
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      if (conn != null) {
        conn.close()
      }
    }
  }
  // Custom method that inserts the data into MySQL
  def insertIntoMysql(url: String, user: String, pwd: String, table: String, list: ListBuffer[Range]) = {
    var conn: Connection = null
    var psmt: PreparedStatement = null
    try {
      conn = getConnection(url, user, pwd)
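      // Disable auto-commit so the whole batch below is committed atomically;
      // `range` is backtick-quoted in the SQL because RANGE is a reserved word in MySQL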
      conn.setAutoCommit(false)
      val sql = "insert into " + table + "(colName,colType,`range`,createTime) values(?,?,?,?)"
      psmt = conn.prepareStatement(sql)
      for (ele <- list) {
        psmt.setString(1, ele.colName)
        psmt.setString(2, ele.colType)
        psmt.setString(3, ele.range)
        psmt.setString(4, ele.createTime)
        psmt.addBatch()
      }
      psmt.executeBatch()
      conn.commit()
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      release(conn, psmt)
    }
  }
}
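Since mergeFrame is already a DataFrame (note that SaveMode is imported above but never used), an alternative worth considering is to let Spark's built-in JDBC writer do the insert instead of hand-written PreparedStatement batches. The sketch below is only illustrative, not part of the original program; it assumes the target table t1 has the columns colName, colType, range and createTime, and that the driver class name matches the Connector/J version actually used.

import java.util.Properties

val props = new Properties()
props.setProperty("user", user)
props.setProperty("password", pwd)
props.setProperty("driver", "com.mysql.jdbc.Driver") // assumed driver class; Connector/J 8 uses com.mysql.cj.jdbc.Driver

// Rename the DataFrame columns to match the table, then append via JDBC
mergeFrame
  .withColumnRenamed("col", "colName")
  .withColumnRenamed("num", "colType")
  .withColumnRenamed("numRange", "range")
  .write
  .mode(SaveMode.Append)
  .jdbc(url, table, props)

This keeps the range-computation logic unchanged and would make getConnection, release and insertIntoMysql unnecessary; the trade-off is less control over the exact INSERT statement and batching behaviour.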