前言:
使用spark去读取JDBC的数据,有三种读取方式,如果数据量特别大的时候,达到了亿级的数据量,最好使用分区的方式读取.否则容易出现OOM异常.
文末再附pom依赖!
一:读取数据的三种方式
1.第一种方式(不指定分区)
1.jdbc源码参数
url:表示jdbc连接
table:表示读取的表名
properties:表示配置信息
def jdbc(url: String, table: String, properties: Properties):
2.代码
class demo03 {
//创建SparkSession入口
private val spark: SparkSession = SparkSession.builder().appName("mysql").master("local[4]").getOrCreate()
//导入隐士转换
import spark.implicits._
//设置连接JDBC的各种配置内容
val url = "jdbc:mysql://node03:3306/spark01"
val table = "student"
val user = "root"
val password = "123"
private val prop = new Properties()
prop.setProperty("user",user)
prop.setProperty("password",password)
@Test
//第一种读取方式
def first = {
spark.read
//指定读取格式的方式
.format("jdbc")
.jdbc(url,table,prop)
.show()
}
}
2.第二种方式(使用任意类型的列作为分区)
1.源码
def jdbc(
url: String,
table: String,
predicates: Array[String],
connectionProperties:
解释:
* @param url JDBC database url of the form `jdbc:subprotocol:subname`
* @param table Name of the table in the external database.
* @param predicates Condition in the where clause for each partition.
* @param connectionProperties JDBC database connection arguments, a list of arbitrary string
* tag/value. Normally at least a "user" and "password" property
* should be included. "fetchsize" can be used to control the
* number of rows per fetch.
2.代码
class demo03 {
//创建SparkSession入口
private val spark: SparkSession = SparkSession.builder().appName("mysql").master("local[4]").getOrCreate()
//导入隐士转换
import spark.implicits._
//设置连接JDBC的各种配置内容
val url = "jdbc:mysql://node03:3306/spark01"
val table = "student"
val user = "root"
val password = "123"
private val prop = new Properties()
prop.setProperty("user",user)
prop.setProperty("password",password)
@Test
//第二种方式读取(使用任意类型的列作为分区)
def second = {
//1.创建predicates,指定age字段和范围作为分区
val predicates = Array(
"age < 30",
"age >= 30 , age < 60 ",
"age >= 60"
)
spark.read
.format("jdbc")
.jdbc(url,table,predicates,prop)
.show()
}
}
3.第三种方式(适合大量数据读取)
1.源码
def jdbc(
url: String,
table: String,
columnName: String,
lowerBound: Long,
upperBound: Long,
numPartitions: Int,
connectionProperties: Properties):
解释:
* @param url JDBC database url of the form `jdbc:subprotocol:subname`.
* @param table Name of the table in the external database.
* @param columnName the name of a column of integral type that will be used for partitioning.
* @param lowerBound the minimum value of `columnName` used to decide partition stride.
*