Tracing the Spark JDBC Read Partition Count Through the Source Code

Let's jump straight into the code that reads from Oracle:

  // Read data from Oracle into a DataFrame
  def readOracle(hiveContext: HiveContext, tableName : String /*,hzn_user : String,hznpassword : String*/): DataFrame ={
    val url=props.getProperty("url")
    val user=props.getProperty("user")
    val password=props.getProperty("password")
    val jdbcMap = Map("url" -> url,
      "user" -> user,
      "password" -> password,
      "dbtable" -> tableName,
      "driver" -> "oracle.jdbc.driver.OracleDriver")
    val jdbcDF = hiveContext.read.options(jdbcMap).format("jdbc").load
    jdbcDF
  }
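As a quick sanity check (a minimal sketch, assuming the Spark 1.x API used above; the table name is a placeholder), you can inspect how many partitions the returned DataFrame has. With only the options above it comes out as 1:

  // Hypothetical check: how many partitions did the JDBC read produce?
  val jdbcDF = readOracle(hiveContext, "SOME_TABLE")   // "SOME_TABLE" is a placeholder
  println(jdbcDF.rdd.partitions.length)                // prints 1 with the options above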

Stepping into the load method, we find a ResolvedDataSource class; judging by the name, it clearly represents a data source that Spark knows how to resolve:

  def load(): DataFrame = {
    val resolved = ResolvedDataSource(
      sqlContext,
      userSpecifiedSchema = userSpecifiedSchema,
      partitionColumns = Array.empty[String],
      provider = source,
      options = extraOptions.toMap)
    DataFrame(sqlContext, LogicalRelation(resolved.relation))
  }

Digging further into ResolvedDataSource, look at its apply method:

 def apply(
      sqlContext: SQLContext,
      userSpecifiedSchema: Option[StructType],
      partitionColumns: Array[String],
      provider: String,
      options: Map[String, String]): ResolvedDataSource = {
    val clazz: Class[_] = lookupDataSource(provider)
    def className: String = clazz.getCanonicalName
    val relation = userSpecifiedSchema match {
      case Some(schema: StructType) => clazz.newInstance() match {
        case dataSource: SchemaRelationProvider =>
          val caseInsensitiveOptions = new CaseInsensitiveMap(options)
          if (caseInsensitiveOptions.contains("paths")) {
            throw new AnalysisException(s"$className does not support paths option.")
          }
          dataSource.createRelation(sqlContext, caseInsensitiveOptions, schema)
        case dataSource: HadoopFsRelationProvider =>
          val maybePartitionsSchema = if (partitionColumns.isEmpty) {
            None
          } else {
            Some(partitionColumnsSchema(
              schema, partitionColumns, sqlContext.conf.caseSensitiveAnalysis))
          }

          val caseInsensitiveOptions = new CaseInsensitiveMap(options)
          val paths = {
            if (caseInsensitiveOptions.contains("paths") &&
              caseInsensitiveOptions.contains("path")) {
              throw new AnalysisException(s"Both path and paths options are present.")
            }
            caseInsensitiveOptions.get("paths")
              .map(_.split("(?<!\\\\),").map(StringUtils.unEscapeString(_, '\\', ',')))
              .getOrElse(Array(caseInsensitiveOptions("path")))
              .flatMap{ pathString =>
                val hdfsPath = new Path(pathString)
                val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
                val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
                SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString)
              }
          }

          val dataSchema =
            StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable

          dataSource.createRelation(
            sqlContext,
            paths,
            Some(dataSchema),
            maybePartitionsSchema,
            caseInsensitiveOptions)
        case dataSource: org.apache.spark.sql.sources.RelationProvider =>
          throw new AnalysisException(s"$className does not allow user-specified schemas.")
        case _ =>
          throw new AnalysisException(s"$className is not a RelationProvider.")
      }

      case None => clazz.newInstance() match {
        case dataSource: RelationProvider =>
          val caseInsensitiveOptions = new CaseInsensitiveMap(options)
          if (caseInsensitiveOptions.contains("paths")) {
            throw new AnalysisException(s"$className does not support paths option.")
          }
          dataSource.createRelation(sqlContext, caseInsensitiveOptions)
        case dataSource: HadoopFsRelationProvider =>
          val caseInsensitiveOptions = new CaseInsensitiveMap(options)
          val paths = {
            if (caseInsensitiveOptions.contains("paths") &&
              caseInsensitiveOptions.contains("path")) {
              throw new AnalysisException(s"Both path and paths options are present.")
            }
            caseInsensitiveOptions.get("paths")
              .map(_.split("(?<!\\\\),").map(StringUtils.unEscapeString(_, '\\', ',')))
              .getOrElse(Array(caseInsensitiveOptions("path")))
              .flatMap{ pathString =>
                val hdfsPath = new Path(pathString)
                val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
                val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
                SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString)
              }
          }
          dataSource.createRelation(sqlContext, paths, None, None, caseInsensitiveOptions)
        case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider =>
          throw new AnalysisException(
            s"A schema needs to be specified when using $className.")
        case _ =>
          throw new AnalysisException(
            s"$className is neither a RelationProvider nor a FSBasedRelationProvider.")
      }
    }
    new ResolvedDataSource(clazz, relation)
  }

We can see three kinds of providers here: SchemaRelationProvider, HadoopFsRelationProvider, and org.apache.spark.sql.sources.RelationProvider. Since our JDBC read does not supply a schema, it goes through the case None branch, and RelationProvider is the one backing relational (JDBC-style) sources. Let's keep drilling into it:

@DeveloperApi
trait RelationProvider {
  /**
   * Returns a new base relation with the given parameters.
   * Note: the parameters' keywords are case insensitive and this insensitivity is enforced
   * by the Map that is passed to the function.
   */
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
}
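To make the contract concrete, here is a minimal, hypothetical provider (class name and schema are made up for illustration; a real source would also mix in TableScan and implement buildScan to actually produce rows). The JDBC DefaultSource we are about to look at follows exactly this pattern:

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical provider: builds a relation from nothing but the options map
class DummyProvider extends RelationProvider {
  override def createRelation(
      ctx: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    // parameters is the case-insensitive map built in ResolvedDataSource.apply
    new BaseRelation {
      override def sqlContext: SQLContext = ctx
      override def schema: StructType = StructType(Seq(StructField("value", StringType)))
      // a real source would also extend TableScan and implement buildScan(): RDD[Row]
    }
  }
}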

It turns out to be a trait, and it has several implementation classes.

Judging by the package names, the first is HBase, the second is JDBC, and the third and fourth belong to Elasticsearch. The one we want is obviously the JDBC implementation. Stepping into it, we hit the interesting part:

class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /** Returns a new base relation with the given parameters. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val partitionColumn = parameters.getOrElse("partitionColumn", null)
    val lowerBound = parameters.getOrElse("lowerBound", null)
    val upperBound = parameters.getOrElse("upperBound", null)
    // This parameter controls how many partitions the read is split into
    val numPartitions = parameters.getOrElse("numPartitions", null)

    if (partitionColumn != null
      && (lowerBound == null || upperBound == null || numPartitions == null)) {
      sys.error("Partitioning incompletely specified")
    }

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn,
        lowerBound.toLong,
        upperBound.toLong,
        numPartitions.toInt)
    }
    // This method computes the partition predicates (and thus the partition count) for the JDBC read
    val parts = JDBCRelation.columnPartition(partitionInfo)
    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach(kv => properties.setProperty(kv._1, kv._2))
    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
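Note that partitionColumn, lowerBound, upperBound and numPartitions must be supplied together, otherwise createRelation fails with "Partitioning incompletely specified". For reference, JDBCPartitioningInfo and JDBCPartition are plain value carriers; reconstructed from their call sites above (the field order and types follow directly from the calls, and columnPartition returns Array[Partition]), they look roughly like this:

// Reconstructed sketch of the package-private helper classes used above
case class JDBCPartitioningInfo(
    column: String,
    lowerBound: Long,
    upperBound: Long,
    numPartitions: Int)

// Each partition carries only its WHERE predicate and its index
case class JDBCPartition(whereClause: String, idx: Int) extends org.apache.spark.Partition {
  override def index: Int = idx
}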

Next, follow val parts = JDBCRelation.columnPartition(partitionInfo) and step into that method:

 def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
    // No partitioning info supplied: return a single partition (index 0) that scans the whole table
    if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

    val numPartitions = partitioning.numPartitions
    val column = partitioning.column
    if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
    // Overflow and silliness can happen if you subtract then divide.
    // Here we get a little roundoff, but that's (hopefully) OK.
    val stride: Long = (partitioning.upperBound / numPartitions
                      - partitioning.lowerBound / numPartitions)
    var i: Int = 0
    var currentValue: Long = partitioning.lowerBound
    var ans = new ArrayBuffer[Partition]()
    while (i < numPartitions) {
      val lowerBound = if (i != 0) s"$column >= $currentValue" else null
      currentValue += stride
      val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
      val whereClause =
        if (upperBound == null) {
          lowerBound
        } else if (lowerBound == null) {
          upperBound
        } else {
          s"$lowerBound AND $upperBound"
        }
      ans += JDBCPartition(whereClause, i)
      i = i + 1
    }
    ans.toArray
  }
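To make the stride arithmetic concrete, suppose (hypothetically) the partition column is ID, lowerBound = 0, upperBound = 100 and numPartitions = 4. Then stride = 100/4 - 0/4 = 25 and the loop emits these WHERE clauses:

  // Partition 0:  ID < 25                  (first slice has no lower bound)
  // Partition 1:  ID >= 25 AND ID < 50
  // Partition 2:  ID >= 50 AND ID < 75
  // Partition 3:  ID >= 75                 (last slice has no upper bound)

Because the first and last slices are open-ended, rows with ID outside [0, 100) are still read; lowerBound and upperBound only control how rows are spread across partitions, not which rows are returned.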

So unless partitionColumn, lowerBound, upperBound and numPartitions are all supplied (and numPartitions > 1), the JDBC read produces a single partition, i.e. the whole table is pulled through one connection by a single task.
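Putting it together, here is a sketch of readOracle extended with the partitioning options (the column name, bounds and partition count below are placeholders; in practice the bounds would come from something like SELECT MIN(ID), MAX(ID) on the source table):

  // Sketch: read Oracle with an explicitly partitioned JDBC scan
  def readOraclePartitioned(hiveContext: HiveContext, tableName: String): DataFrame = {
    val jdbcMap = Map(
      "url"             -> props.getProperty("url"),
      "user"            -> props.getProperty("user"),
      "password"        -> props.getProperty("password"),
      "dbtable"         -> tableName,
      "driver"          -> "oracle.jdbc.driver.OracleDriver",
      "partitionColumn" -> "ID",       // hypothetical numeric column
      "lowerBound"      -> "1",        // hypothetical min(ID)
      "upperBound"      -> "1000000",  // hypothetical max(ID)
      "numPartitions"   -> "8")        // split the scan into 8 partitions
    hiveContext.read.format("jdbc").options(jdbcMap).load()
  }

With these options, columnPartition generates eight WHERE clauses over ID and the resulting DataFrame has eight partitions, each fetched over its own JDBC connection.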
