Below is an example program that uses Spark SQL to read data from MySQL, convert it into a local Map, and print the result to the console.
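The example assumes Spark SQL and the MySQL JDBC driver are already on the classpath; with sbt that could look like the following (the version numbers are illustrative, not prescriptive):

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "3.3.2" % "provided",
  "mysql" % "mysql-connector-java" % "8.0.33"
)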
import org.apache.spark.sql.SparkSession

val spark = SparkSession
  .builder()
  .appName(this.getClass.getSimpleName.stripSuffix("$"))
  .master("local[*]")
  // keep parallelism modest for this small local example
  .config("spark.default.parallelism", 8)
  .config("spark.sql.shuffle.partitions", 8)
  .getOrCreate()
import spark.implicits._
val jdbcHost = "192.168.1.2"
val jdbcPort = 3306
val jdbcDbName = "test_db"
val jdbcTable = "cus_timezone_info"
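// serverTimezone should match the time zone the MySQL server actually runs in,
// otherwise date/time values may shift when read through Connector/J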
val jdbcUrl = s"jdbc:mysql://${jdbcHost}:${jdbcPort}/${jdbcDbName}?serverTimezone=UTC&characterEncoding=UTF-8&autoReconnect=true&failOverReadOnly=false"
val jdbcDF = spark
  .read
  .format("jdbc")
  .option("url", jdbcUrl)
  .option("dbtable", jdbcTable)
  .option("user", "root")
  .option("password", "******")
  // Custom schema: only some of the columns need to be listed; Spark SQL maps the rest automatically
  .option("customSchema", "status SHORT, sort_no INTEGER")
  // dbtable and query cannot both be specified
  //.option("query", "select c1, c2 from t1")
  .load()
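// Note: for a large table, the JDBC read could be parallelized by adding, before .load(),
// options like the following (the column name `id` here is hypothetical; it must be a
// numeric, date, or timestamp column):
//   .option("partitionColumn", "id")
//   .option("lowerBound", "1")
//   .option("upperBound", "1000000")
//   .option("numPartitions", "8")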
//jdbcDF.printSchema()
jdbcDF.createOrReplaceTempView("v_data")
//GROUP BY is used here to deduplicate; note that in Spark SQL, DISTINCT is planned as the same aggregate, so the two generally perform identically
val filteredDF = spark.sql(
  """
    |SELECT
    |  company_id,
    |  trim(timezone_id) AS timezone_id
    |FROM v_data
    |WHERE status = 0
    |GROUP BY company_id, trim(timezone_id)
    |""".stripMargin)
//filteredDF.show(200, false)
//collect() pulls the whole result set to the driver, so this is only safe for small result sets
val companyTimezoneIdMap = filteredDF.map(row => {
  val company_id = row.getAs[String]("company_id")
  val timezone_id = row.getAs[String]("timezone_id")
  (company_id, timezone_id)
}).collect()
  .toMap
companyTimezoneIdMap.foreach {
  case (companyId, timezoneId) => println(s"companyId: ${companyId}, timezoneId: ${timezoneId}")
}
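// If the map were needed on executors (e.g. inside a UDF) rather than only on the driver,
// it would normally be broadcast first; a minimal sketch, assuming string-typed company ids:
//   import org.apache.spark.sql.functions.udf
//   val bcMap = spark.sparkContext.broadcast(companyTimezoneIdMap)
//   val timezoneOf = udf((companyId: String) => bcMap.value.getOrElse(companyId, ""))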
spark.close()