Scala / Spark SQL query and output demo

Overview

  1. Spark initialization and JDBC connection
  2. Define the Person schema, create a DataFrame, and structure the data
  3. Register a temp table and run a simple SQL query
  4. Output the results: JSON to HDFS, and write the results to MySQL via JDBC in two ways
Environment: IntelliJ IDEA + Maven, Hadoop HA

Code snippets

  • Spark initialization
    // HDFS has permission checks enabled; set the user name to avoid HDFS read/write permission errors
    System.setProperty("HADOOP_USER_NAME", "hadoop")

    // Spark initialization
    val conf = new SparkConf().setAppName("SQL-SCHEMA").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlsc = new SQLContext(sc)
  • Define the Person schema, create a DataFrame, structure the data
    // Read the data from HDFS
    val lineRDD = sc.textFile(args(0)).map(_.split(","))

    val schema = StructType(List(
      StructField("id",IntegerType,true),
      StructField("name",StringType,true),
      StructField("age",IntegerType,true)))

    val rowRDD = lineRDD.map(x =>Row(x(0).toInt,x(1),x(2).toInt))
    val personDF: DataFrame = sqlsc.createDataFrame(rowRDD, schema)
  • Register a temp table, run a simple SQL query
    personDF.registerTempTable("t_person")
    val sqlrs: DataFrame = sqlsc.sql("select * from t_person order by age desc limit 2")
  • Output the results:
  • 1. Write the data to HDFS as JSON
  • 2. Write the results to MySQL via JDBC in two different ways

For the two DataFrame-based MySQL outputs (MySQL DF 1 and MySQL DF 2 below), see the Scala API.

// Output
    // Output - HDFS JSON
    sqlrs.write.json(args(1))

    // Output - MySQL DF 1
    sqlrs.write.format("jdbc")
      .option("url","jdbc:mysql://192.168.xx.xx:3306/sparktestresult")
      .option("dbtable","t_person")
      .option("user","root")
      .option("password","123456")
      .mode(SaveMode.Append).save()

    // Output - MySQL DF 2
    val connectionProperties = new Properties()
    connectionProperties.setProperty("user","root")
    connectionProperties.setProperty("password","123456")
    sqlrs.write.mode("append").jdbc("jdbc:mysql://192.168.xx.xx:3306/sparktestresult","t_person",connectionProperties)

    // Output - MySQL RDD 3 (InsertData is defined in the full source below)
    rowRDD.foreachPartition(InsertData)

Looking at the source of DataFrame.write.jdbc(...), it turns out to use essentially the same mechanism as write.format("jdbc").option("xxx", "xxx"):

  def jdbc(url: String, table: String, connectionProperties: Properties): Unit = {
    assertNotPartitioned("jdbc")
    assertNotBucketed("jdbc")
    // connectionProperties should override settings in extraOptions.
    this.extraOptions ++= connectionProperties.asScala
    // explicit url and dbtable should override all
    this.extraOptions += ("url" -> url, "dbtable" -> table)
    format("jdbc").save()
  }

Source code

Dependencies

<dependencies>

    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-hive -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-hive_2.11</artifactId>
        <version>2.3.0</version>
        <scope>provided</scope>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-core -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-sql_2.11</artifactId>
        <version>2.3.0</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-mllib -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-mllib_2.11</artifactId>
        <version>2.3.0</version>
        <scope>runtime</scope>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-client -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>2.7.3</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/mysql/mysql-connector-java -->
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>5.1.38</version>
    </dependency>

    <!-- https://mvnrepository.com/artifact/org.apache.spark/spark-streaming -->
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-streaming_2.11</artifactId>
        <version>2.3.0</version>
<!--        <scope>provided</scope>-->
    </dependency>

</dependencies>

App

import java.util.Properties

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Row, SQLContext, SaveMode}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import java.sql.{Connection, PreparedStatement}

object SparkSQL_Person_Schema {
  val connProps :Properties = new Properties()
  connProps.setProperty("user","root")
  connProps.setProperty("password","123456")
  val url = "jdbc:mysql://192.168.xx.xx:3306/sparktestresult"
  val conn :Connection = dbUtils.getConnection("mysql",url,connProps)

  def InsertData(iter:Iterator[Row]) = {
    try {
      iter.foreach(row => {
        val valueseq: Seq[Any] = row.toSeq // the iterator yields Rows; convert each Row to a Seq for positional access
        val id = valueseq(0)
        val name = valueseq(1)
        val age = valueseq(2)

        // reuses the single object-level connection; a new prepared statement is created per row
        val statement: PreparedStatement = conn.prepareStatement("insert into t_person(id,name,age) values(?,?,?)")
        statement.setInt(1,id.asInstanceOf[Int])
        statement.setString(2,name.asInstanceOf[String])
        statement.setInt(3,age.asInstanceOf[Int])

        statement.executeUpdate()
        statement.close()
      })
    } catch {
      case ex: Exception => // handle exception
    }
  }

  def main(args: Array[String]): Unit = {
    // HDFS has permission checks enabled; set the user name to avoid HDFS read/write permission errors
    System.setProperty("HADOOP_USER_NAME", "hadoop")

    // Spark initialization
    val conf = new SparkConf().setAppName("SQL-SCHEMA").setMaster("local[*]")
    val sc = new SparkContext(conf)
    val sqlsc = new SQLContext(sc)

    // Read the data from HDFS
    val lineRDD = sc.textFile(args(0)).map(_.split(","))

    val schema = StructType(List(
      StructField("id",IntegerType,true),
      StructField("name",StringType,true),
      StructField("age",IntegerType,true)))

    val rowRDD = lineRDD.map(x =>Row(x(0).toInt,x(1),x(2).toInt))
    val personDF: DataFrame = sqlsc.createDataFrame(rowRDD, schema)

    // Register the temp table
    personDF.registerTempTable("t_person")
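    // Note: registerTempTable is deprecated since Spark 2.0; createOrReplaceTempView is the current equivalent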

    val sqlrs: DataFrame = sqlsc.sql("select * from t_person order by age desc limit 2")

    // Output
    // Output - HDFS JSON
    sqlrs.write.json(args(1))

/*
    // Output - MySQL DF 1
    sqlrs.write.format("jdbc")
      .option("url","jdbc:mysql://192.168.xx.xx:3306/sparktestresult")
      .option("dbtable","t_person")
      .option("user","root")
      .option("password","123456")
      .mode(SaveMode.Append).save()

    // Output - MySQL DF 2
    val connectionProperties = new Properties()
    connectionProperties.setProperty("user","root")
    connectionProperties.setProperty("password","123456")
    sqlrs.write.mode("append").jdbc("jdbc:mysql://192.168.xx.xx:3306/sparktestresult","t_person",connectionProperties)
*/
    // Output - MySQL RDD 3
    rowRDD.foreachPartition(InsertData)

    sc.stop()
  }
}
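As a variation on the RDD output above, a common pattern is to open the JDBC connection inside foreachPartition, so that each partition manages its own connection and prepared statement instead of sharing the object-level conn. A minimal sketch, assuming the same dbUtils helpers, url/connProps values, and t_person table as above:

// Hypothetical variant of InsertData: one connection and one prepared statement per partition
rowRDD.foreachPartition { iter =>
  val conn = dbUtils.getConnection("mysql", url, connProps)
  val statement = conn.prepareStatement("insert into t_person(id,name,age) values(?,?,?)")
  try {
    iter.foreach { row =>
      statement.setInt(1, row.getInt(0))
      statement.setString(2, row.getString(1))
      statement.setInt(3, row.getInt(2))
      statement.executeUpdate()
    }
  } finally {
    statement.close()
    dbUtils.close(conn)
  }
}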

dbUtils

import java.sql.{Connection, DriverManager}
import java.util.Properties

package object dbUtils {
  // driver load
  def driverLoad(dbType:String) = {
    try {
        dbType match{
        case "mysql" => Class.forName("com.mysql.jdbc.Driver")
        case "db2" | "oracle" | _  => "pending"
      }
    }catch{
      case ex:Exception  => // handle driver load exception
    }
  }
  // getconnection
  def getConnection(dbType:String,url:String,connProps:Properties):Connection = {
    this.driverLoad(dbType)
    DriverManager.getConnection(url,connProps.getProperty("user"),connProps.getProperty("password"))
  }

  // free connection
  def close(conn:Connection)= {
    try{
      if(conn != null && !conn.isClosed) conn.close()
    }catch{
      case ex:Exception => // handle close exception
    }
  }
}

Input and output

hadoop fs -cat hdfs://mycluster/person/data.csv
1,xiaoming,12
2,xiaoli,13
3,mayun,18
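
Given this input and the query (top two rows by age, descending), the JSON part files written under args(1) should each contain one JSON object per line, roughly:

{"id":3,"name":"mayun","age":18}
{"id":2,"name":"xiaoli","age":13}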
