SparkSQL and JDBC

import java.sql.DriverManager
import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}

def main(args: Array[String]): Unit = {
  val conf = new SparkConf().setAppName("ScalaSparkSQLJDBCOps").setMaster("local")
  conf.set("spark.sql.shuffle.partitions", "1")
  val sc = new SparkContext(conf)
  val sqlContext = new SQLContext(sc)

  /**
   * Loading with the "text" source puts every line into a single column named
   * "value", which does not meet our needs here, so these reads are kept only
   * for reference.
   */
  // val tbDF = sqlContext.read.format("text").load("E:/test/spark/sql/teacher_basic.txt")
  // val tbDF = sqlContext.read.text("E:/test/spark/sql/teacher_basic.txt")
  // tbDF.show()
  // val tiDF = sqlContext.read.text("E:/test/spark/sql/teacher_info.txt")
  // tiDF.show()

  /**
   * Load both tables into RDDs, then convert the RDDs into DataFrames.
   */
  val tbLinesRDD = sc.textFile("E:/test/spark/sql/teacher_basic.txt")
  val tbTupleRDD = tbLinesRDD.map(line => {
    // name,age,married,children
    val splits = line.split(",")
    val name = splits(0).trim
    val age = splits(1).trim.toInt
    val married = splits(2).trim.toBoolean
    val children = splits(3).trim.toInt
    (name, age, married, children)
  })
  // tbTupleRDD ---> tbDF
  // (either supply the schema explicitly via StructType, or call toDF directly;
  //  toDF needs the implicit conversions imported below and does not work when
  //  the RDD's element type is Row)
  import sqlContext.implicits._
  val tbDF = tbTupleRDD.toDF("name", "age", "married", "children")

  // likewise for tiDF
  val tiLinesRDD = sc.textFile("E:/test/spark/sql/teacher_info.txt")
  val tiTupleRDD = tiLinesRDD.map(line => {
    // name,height
    val splits = line.split(",")
    val name = splits(0).trim
    val height = splits(1).trim.toInt
    (name, height)
  })
  val tiDF = tiTupleRDD.toDF("name", "height")
  tiDF.printSchema()
  tiDF.show()

  /**
   * 2. tbDF.join(tiDF)
   */
  val joinedDF = tbDF.join(tiDF, "name") // or join on an expression: $"df1Key" === $"df2Key"
  // val joinedDF = tbDF.join(tiDF)
  // joinedDF.show()
}
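The comment in main mentions that, instead of toDF, the DataFrame can be built from explicit StructType metadata. Here is a minimal sketch of that route, assuming the same teacher_basic.txt layout; the helper name tbDFViaStructType is ours, not part of the original code:

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{BooleanType, IntegerType, StringType, StructField, StructType}

// Sketch: build an RDD[Row] plus an explicit schema, then call createDataFrame.
def tbDFViaStructType(sc: SparkContext, sqlContext: SQLContext): DataFrame = {
  val rowRDD = sc.textFile("E:/test/spark/sql/teacher_basic.txt").map(line => {
    val splits = line.split(",")
    Row(splits(0).trim, splits(1).trim.toInt, splits(2).trim.toBoolean, splits(3).trim.toInt)
  })
  val schema = StructType(Seq(
    StructField("name", StringType),
    StructField("age", IntegerType),
    StructField("married", BooleanType),
    StructField("children", IntegerType)))
  sqlContext.createDataFrame(rowRDD, schema)
}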
/**
 * Batch SQL: build one multi-row INSERT per partition.
 *
 * @param joinedDF
 */
def save2DB_5(joinedDF: DataFrame): Unit = {
  joinedDF.foreachPartition(rows => {
    // Check whether this partition is empty; skip it if so, otherwise run the steps below.
    if (!rows.isEmpty) {
      classOf[com.mysql.jdbc.Driver]
      val url = "jdbc:mysql://localhost:3306/bigdata_db"
      val username = "root"
      val password = "root"
      val connection = DriverManager.getConnection(url, username, password)
      // generated SQL looks like:
      // insert into teachers values(null, "xxxx", 175, 20, 1, 2), (null, "xxxx", 175, 20, 1, 2)
      val sb = new java.lang.StringBuilder("INSERT INTO teachers VALUES(NULL, ")
      val st = connection.createStatement()
      for (row <- rows) {
        sb.append("'" + row.getAs[String]("name") + "', ")
          .append(row.getAs[Int]("height") + ", ")
          .append(row.getAs[Int]("age") + ", ")
          .append(row.getAs[Boolean]("married") + ", ")
          .append(row.getAs[Int]("children"))
          .append("),(NULL, ")
      }
      // drop the trailing ",(NULL, " left over from the last iteration
      val sql = sb.substring(0, sb.lastIndexOf(",(NULL, "))
      // println(sql)
      st.execute(sql)
      st.close()
      connection.close()
    }
  })
}

/**
 * The second write approach, optimized: foreachPartition.
 *
 * @param joinedDF
 */
def save2DB_3(joinedDF: DataFrame): Unit = {
  joinedDF.foreachPartition(rows => {
    // rows is the collection of all Rows in the current partition
    classOf[com.mysql.jdbc.Driver]
    val url = "jdbc:mysql://localhost:3306/bigdata_db"
    val username = "root"
    val password = "root"
    val connection = DriverManager.getConnection(url, username, password)
    val sql = "INSERT INTO teachers VALUES(NULL, ?, ?, ?, ?, ?)"
    val ps = connection.prepareStatement(sql)
    for (row <- rows) {
      ps.setString(1, row.getAs[String]("name"))
      ps.setInt(2, row.getAs[Int]("height"))
      ps.setInt(3, row.getAs[Int]("age"))
      ps.setBoolean(4, row.getAs[Boolean]("married"))
      ps.setInt(5, row.getAs[Int]("children"))
      ps.execute()
    }
    ps.close()
    connection.close()
  })
}

/**
 * The second write approach: foreach.
 *
 * @param joinedDF
 */
def save2DB_2(joinedDF: DataFrame): Unit = {
  joinedDF.foreach(row => {
    // 1. Register the driver
    // Class.forName("com.mysql.jdbc.Driver")
    classOf[com.mysql.jdbc.Driver]
    val url = "jdbc:mysql://localhost:3306/bigdata_db"
    val username = "root"
    val password = "root"
    // 2. Get a DB connection from the DriverManager
    val connection = DriverManager.getConnection(url, username, password)
    val sql = "INSERT INTO teachers VALUES(NULL, ?, ?, ?, ?, ?)"
    // 3. Create the statement
    val ps = connection.prepareStatement(sql)
    ps.setString(1, row.getAs[String]("name"))
    ps.setInt(2, row.getAs[Int]("height"))
    ps.setInt(3, row.getAs[Int]("age"))
    ps.setBoolean(4, row.getAs[Boolean]("married"))
    ps.setInt(5, row.getAs[Int]("children"))
    // 4. Execute the SQL
    ps.execute()
    // 5. Handle the return value (optional)
    // 6. Release resources
    ps.close()
    connection.close()
  })
}

def save2DB_1(joinedDF: DataFrame): Unit = {
  // When using write.jdbc, if the target table does not exist yet,
  // a matching table is created in the database.
  val url = "jdbc:mysql://localhost:3306/bigdata_db"
  val tableName = "teachers"
  val properties = new Properties()
  properties.setProperty("user", "root")
  properties.setProperty("password", "root")
  joinedDF.select("name", "age").write.mode(SaveMode.Append).jdbc(url, tableName, properties)
}

/**
 * Read data from MySQL.
 *
 * @param sqlContext
 */
def readFromJDBC(sqlContext: SQLContext): Unit = {
  val url = "jdbc:mysql://localhost:3306/test"
  val table = "score"
  val properties = new Properties()
  properties.setProperty("user", "root")
  properties.setProperty("password", "root")
  // This variant reads the entire table in one go
  val df = sqlContext.read.jdbc(url, table, properties)
  df.show()
}
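readFromJDBC above pulls the whole score table through a single connection. Spark's DataFrameReader also offers a jdbc overload that splits the read across partitions along a numeric column. A sketch, assuming score has an integer primary key id with values roughly between 1 and 10000 (both the column name and the bounds are assumptions, not from the original):

import java.util.Properties
import org.apache.spark.sql.SQLContext

// Sketch: partitioned JDBC read; each of the 4 partitions fetches one id range.
def readFromJDBCPartitioned(sqlContext: SQLContext): Unit = {
  val url = "jdbc:mysql://localhost:3306/test"
  val properties = new Properties()
  properties.setProperty("user", "root")
  properties.setProperty("password", "root")
  val df = sqlContext.read.jdbc(
    url,
    "score",
    "id",    // partition column (assumed)
    1L,      // lowerBound (assumed)
    10000L,  // upperBound (assumed)
    4,       // numPartitions
    properties)
  df.show()
}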
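A closing note on the write path: save2DB_5 gains speed by concatenating one big multi-row INSERT, but string-built SQL breaks as soon as a name contains a quote. JDBC's own batching achieves a similar effect with parameterized statements. The sketch below (the name save2DB_4 is invented here) combines save2DB_3's PreparedStatement with addBatch/executeBatch:

import java.sql.DriverManager
import org.apache.spark.sql.DataFrame

// Sketch: parameterized batching per partition via addBatch/executeBatch.
def save2DB_4(joinedDF: DataFrame): Unit = {
  joinedDF.foreachPartition(rows => {
    if (!rows.isEmpty) {
      classOf[com.mysql.jdbc.Driver]
      val connection = DriverManager.getConnection(
        "jdbc:mysql://localhost:3306/bigdata_db", "root", "root")
      val ps = connection.prepareStatement(
        "INSERT INTO teachers VALUES(NULL, ?, ?, ?, ?, ?)")
      for (row <- rows) {
        ps.setString(1, row.getAs[String]("name"))
        ps.setInt(2, row.getAs[Int]("height"))
        ps.setInt(3, row.getAs[Int]("age"))
        ps.setBoolean(4, row.getAs[Boolean]("married"))
        ps.setInt(5, row.getAs[Int]("children"))
        ps.addBatch() // queue the row instead of executing it immediately
      }
      ps.executeBatch() // send the whole partition in one round trip
      ps.close()
      connection.close()
    }
  })
}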