1. 通过predicates设置读取并行度;如果只用 spark.read.jdbc(mySqlHelper.url, mysql_table, mySqlHelper.prop)(不传predicates),则并行度是1。
// Connection settings for the benchmark's target MySQL instance.
// NOTE(review): placeholders — fill in before running; empty values will fail at connect time.
val ip = ""
val user = ""
val database = ""
val password = ""
// Project-local helper providing the JDBC url and connection Properties
// (used below as mySqlHelper.url / mySqlHelper.prop).
// NOTE(review): argument order is (ip, database, user, password) — confirm against MySqlHelper's definition.
val mySqlHelper = MySqlHelper(ip,database,user,password)
/**
 * Benchmarks JDBC read parallelism against MySQL.
 *
 * f1 reads with 100 non-overlapping predicates (100 partitions);
 * f2 reads with the no-predicates overload (a single partition).
 * Each prints the partition count and elapsed milliseconds.
 *
 * @param args unused command-line arguments
 */
def main(args: Array[String]): Unit = {
  val spark = SparkSession.builder()
    .appName(this.getClass.getSimpleName)
    .master("local[*]")
    .enableHiveSupport()
    .getOrCreate()
  // NOTE(review): placeholder — set to the benchmark table before running.
  val mysql_table = ""

  // Partitioned read: one predicate per SHA1 hash bucket, so the 100
  // predicates are disjoint and together cover every row exactly once.
  def f1(): Unit = {
    // Idiomatic replacement for the original mutable ArrayBuffer + for loop.
    val predicates = Array.tabulate(100)(i => s"SHA1(fieldName)%100 = $i")
    val startTime = System.currentTimeMillis()
    val df = spark.read.jdbc(mySqlHelper.url, mysql_table, predicates, mySqlHelper.prop)
    println(df.rdd.getNumPartitions) // expected: 100 (one per predicate)
    println(df.count())
    //df.show(false)
    println(System.currentTimeMillis() - startTime)
  }

  // Unpartitioned read: the overload without predicates yields 1 partition.
  def f2(): Unit = {
    val startTime = System.currentTimeMillis()
    val df = spark.read.jdbc(mySqlHelper.url, mysql_table, mySqlHelper.prop)
    println(df.rdd.getNumPartitions) // expected: 1
    df.show(false)
    // println(df.count())
    println(System.currentTimeMillis() - startTime)
  }

  // Bug fix: the original defined f1/f2 but never invoked them, so the
  // benchmark never actually ran before spark.stop(). Also ensure the
  // session is stopped even if a read throws.
  try {
    f1()
    f2()
  } finally {
    spark.stop()
  }
}
2. 测试结果:
数据量在百万级别时,两者读取速度没有明显差别;在千万级别时,f1明显快得多。