连接数据库和导入数据库
pom导入
<!-- MySQL JDBC 驱动(Connector/J 8,驱动类为 com.mysql.cj.jdbc.Driver) -->
<dependency>
    <groupId>mysql</groupId>
    <artifactId>mysql-connector-java</artifactId>
    <version>8.0.29</version>
</dependency>
<!-- Parquet 公共库 -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-common</artifactId>
    <version>1.8.2</version>
</dependency>
<!-- Spark SQL + Hive 支持(Scala 2.11) -->
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-hive_2.11</artifactId>
    <version>2.3.4</version>
</dependency>
Scala 代码(注意:下面的工具类是 Scala 而非 Java):
import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}

import java.util.Properties

/**
 * Utility object for moving DataFrames between Spark and external stores:
 * MySQL (over JDBC), Parquet files, and Hive tables.
 *
 * NOTE(review): the JDBC URL and credentials are hard-coded for the demo;
 * move them to configuration before any real use.
 */
object JDBCUtile {
  val url = "jdbc:mysql://192.168.52.146:3306/etclog"
  val driver = "com.mysql.cj.jdbc.Driver"
  val user = "root"
  val password = "ok"
  // Sample table name used by the usage examples below.
  val frame = "frame"

  private val prop = new Properties()
  prop.setProperty("user", user)
  prop.setProperty("password", password)
  // BUG FIX: the key was misspelled "dirver", so the driver class name was
  // never actually handed to Spark's JDBC reader/writer (it only worked by
  // relying on Connector/J's automatic driver registration).
  prop.setProperty("driver", driver)

  /**
   * Writes `df` into the MySQL table `table`.
   * @param op 0 = append to the table, 1 (default) = overwrite it; any other value is a no-op
   */
  def dataFrameToMysql(df: DataFrame, table: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).jdbc(url, table, prop)
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).jdbc(url, table, prop)
    }
  }

  /**
   * Writes `df` as Parquet under the path `output`.
   * @param op 0 = append, 1 (default) = overwrite; any other value is a no-op
   */
  def dataFrameToParquet(df: DataFrame, output: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).parquet(output)
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).parquet(output)
    }
  }

  /**
   * Saves `df` as the Hive table `table`.
   * @param op 0 = append, 1 (default) = overwrite; any other value is a no-op
   */
  def dataFrameHive(df: DataFrame, table: String, op: Int = 1): Unit = {
    if (op == 0) {
      df.write.mode(SaveMode.Append).saveAsTable(table)
    } else if (op == 1) {
      df.write.mode(SaveMode.Overwrite).saveAsTable(table)
    }
  }

  /** Reads the MySQL table `table` into a DataFrame. */
  def getDataFrameFromMysql(spark: SparkSession, table: String): DataFrame =
    spark.read.jdbc(url, table, prop)

  /** Reads Parquet data from `path` into a DataFrame. (Name kept as-is — "Parquest" — for caller compatibility.) */
  def getDataFrameFromParquest(spark: SparkSession, path: String): DataFrame =
    spark.read.parquet(path)

  /** Reads the Hive table `table` into a DataFrame via a full-table SELECT. */
  def getDataFrameFromHive(spark: SparkSession, table: String): DataFrame =
    spark.sql("select * from " + table)
}
从 MySQL 读取数据并处理的演示(Parquet、Hive 的用法类似):可以把数据库中的数据读入 DataFrame、加工后再写回数据库。
import org.apache.spark.sql.SparkSession

/**
 * Demo: read the Student and SC (score) tables from MySQL, then list the
 * students whose score in course "01" is higher than in course "02".
 */
object aaa {
  def main(args: Array[String]): Unit = {
    // BUG FIX: the original built a second session (spark2) for the SC table,
    // but SparkSession.builder().getOrCreate() returns the SAME session the
    // second time, so spark2 was just an alias and spark2.close() closed the
    // one session twice. One session serves both reads.
    val spark = SparkSession.builder()
      .appName("Student")
      .master("local[*]")
      .getOrCreate()

    val student = JDBCUtile.getDataFrameFromMysql(spark, "Student")
    val sc = JDBCUtile.getDataFrameFromMysql(spark, "SC")

    import spark.implicits._ // implicit conversions for Dataset operations

    // Rows of SC for course 01 and course 02 respectively.
    val course01 = sc.filter(row => row.getAs("CId") == "01")
    val course02 = sc.filter(row => row.getAs("CId") == "02")

    // Pair each student's course-01 row with their course-02 row, rename the
    // duplicated columns, and keep students scoring higher in course 01.
    // (Spark column resolution is case-insensitive by default, so "SId"
    // matches the renamed "SID" column.)
    val betterIn01 = course01.join(course02, Seq("SId"), "inner")
      .toDF("SID", "CID1", "score1", "CID2", "score2")
      .filter(row => row.getAs("score1").toString.toDouble > row.getAs("score2").toString.toDouble)
      .select("SID")

    // Attach the student details and print the result.
    betterIn01.join(student, Seq("SId"), "inner").show()

    spark.close()
  }
}
将数据导入 MySQL、Hive 或 Parquet 也很简单:调用对应的工具方法,传入要写出的 DataFrame 和目标表名/路径即可:
// JDBCUtile.dataFrameToMysql(frame, JDBCUtile.frame)        // 将 DataFrame 数据导入 MySQL
// JDBCUtile.dataFrameToParquet(frame, "out/fullog", op = 1) // 将 DataFrame 数据导入 Parquet
// JDBCUtile.dataFrameHive(frame, "sss", 1)                  // 将 DataFrame 数据导入 Hive