Let's jump straight into the code that reads from Oracle:
// Read data from Oracle
def readOracle(hiveContext: HiveContext, tableName: String): DataFrame = {
  // props is a java.util.Properties instance loaded elsewhere (e.g. from a config file)
  val url      = props.getProperty("url")
  val user     = props.getProperty("user")
  val password = props.getProperty("password")
  val jdbcMap = Map(
    "url"      -> url,
    "user"     -> user,
    "password" -> password,
    "dbtable"  -> tableName,
    "driver"   -> "oracle.jdbc.driver.OracleDriver")
  val jdbcDF = hiveContext.read.options(jdbcMap).format("jdbc").load
  jdbcDF
}
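A quick usage sketch, assuming props has already been loaded and using a made-up table name:
// Hypothetical call: EMPLOYEES is just an example table name
val empDF = readOracle(hiveContext, "EMPLOYEES")
empDF.printSchema()
empDF.show(10)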
Stepping into the load method, we find a ResolvedDataSource class. Judging by the name, this is clearly the abstraction for the data sources Spark can handle:
def load(): DataFrame = {
  val resolved = ResolvedDataSource(
    sqlContext,
    userSpecifiedSchema = userSpecifiedSchema,
    partitionColumns = Array.empty[String],
    provider = source,
    options = extraOptions.toMap)
  DataFrame(sqlContext, LogicalRelation(resolved.relation))
}
Drilling further into ResolvedDataSource, let's look at its apply method:
def apply(
    sqlContext: SQLContext,
    userSpecifiedSchema: Option[StructType],
    partitionColumns: Array[String],
    provider: String,
    options: Map[String, String]): ResolvedDataSource = {
  val clazz: Class[_] = lookupDataSource(provider)
  def className: String = clazz.getCanonicalName
  val relation = userSpecifiedSchema match {
    case Some(schema: StructType) => clazz.newInstance() match {
      case dataSource: SchemaRelationProvider =>
        val caseInsensitiveOptions = new CaseInsensitiveMap(options)
        if (caseInsensitiveOptions.contains("paths")) {
          throw new AnalysisException(s"$className does not support paths option.")
        }
        dataSource.createRelation(sqlContext, caseInsensitiveOptions, schema)
      case dataSource: HadoopFsRelationProvider =>
        val maybePartitionsSchema = if (partitionColumns.isEmpty) {
          None
        } else {
          Some(partitionColumnsSchema(
            schema, partitionColumns, sqlContext.conf.caseSensitiveAnalysis))
        }
        val caseInsensitiveOptions = new CaseInsensitiveMap(options)
        val paths = {
          if (caseInsensitiveOptions.contains("paths") &&
            caseInsensitiveOptions.contains("path")) {
            throw new AnalysisException(s"Both path and paths options are present.")
          }
          caseInsensitiveOptions.get("paths")
            .map(_.split("(?<!\\\\),").map(StringUtils.unEscapeString(_, '\\', ',')))
            .getOrElse(Array(caseInsensitiveOptions("path")))
            .flatMap { pathString =>
              val hdfsPath = new Path(pathString)
              val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
              val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
              SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString)
            }
        }
        val dataSchema =
          StructType(schema.filterNot(f => partitionColumns.contains(f.name))).asNullable
        dataSource.createRelation(
          sqlContext,
          paths,
          Some(dataSchema),
          maybePartitionsSchema,
          caseInsensitiveOptions)
      case dataSource: org.apache.spark.sql.sources.RelationProvider =>
        throw new AnalysisException(s"$className does not allow user-specified schemas.")
      case _ =>
        throw new AnalysisException(s"$className is not a RelationProvider.")
    }
    case None => clazz.newInstance() match {
      case dataSource: RelationProvider =>
        val caseInsensitiveOptions = new CaseInsensitiveMap(options)
        if (caseInsensitiveOptions.contains("paths")) {
          throw new AnalysisException(s"$className does not support paths option.")
        }
        dataSource.createRelation(sqlContext, caseInsensitiveOptions)
      case dataSource: HadoopFsRelationProvider =>
        val caseInsensitiveOptions = new CaseInsensitiveMap(options)
        val paths = {
          if (caseInsensitiveOptions.contains("paths") &&
            caseInsensitiveOptions.contains("path")) {
            throw new AnalysisException(s"Both path and paths options are present.")
          }
          caseInsensitiveOptions.get("paths")
            .map(_.split("(?<!\\\\),").map(StringUtils.unEscapeString(_, '\\', ',')))
            .getOrElse(Array(caseInsensitiveOptions("path")))
            .flatMap { pathString =>
              val hdfsPath = new Path(pathString)
              val fs = hdfsPath.getFileSystem(sqlContext.sparkContext.hadoopConfiguration)
              val qualified = hdfsPath.makeQualified(fs.getUri, fs.getWorkingDirectory)
              SparkHadoopUtil.get.globPathIfNecessary(qualified).map(_.toString)
            }
        }
        dataSource.createRelation(sqlContext, paths, None, None, caseInsensitiveOptions)
      case dataSource: org.apache.spark.sql.sources.SchemaRelationProvider =>
        throw new AnalysisException(
          s"A schema needs to be specified when using $className.")
      case _ =>
        throw new AnalysisException(
          s"$className is neither a RelationProvider nor a FSBasedRelationProvider.")
    }
  }
  new ResolvedDataSource(clazz, relation)
}
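The provider string passed to format() decides which case is taken here. A quick sketch (the parquet path is hypothetical, jdbcMap is the options map built in readOracle above):
// RelationProvider branch: no path option, the schema comes from the database itself
val oracleDF  = hiveContext.read.format("jdbc").options(jdbcMap).load()
// HadoopFsRelationProvider branch: a file-based source that needs a path
val parquetDF = hiveContext.read.format("parquet").load("/tmp/example/parquet")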
So apply distinguishes three kinds of data source providers: SchemaRelationProvider, HadoopFsRelationProvider and org.apache.spark.sql.sources.RelationProvider. RelationProvider is the one for relational data sources, so let's keep drilling into it:
@DeveloperApi
trait RelationProvider {
  /**
   * Returns a new base relation with the given parameters.
   * Note: the parameters' keywords are case insensitive and this insensitivity is enforced
   * by the Map that is passed to the function.
   */
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String]): BaseRelation
}
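To get a feel for the contract, here is a minimal, hypothetical implementation (EchoOptionsSource is made up, not part of Spark); it simply echoes the options it receives back as a one-row table:
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, RelationProvider, TableScan}
import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Hypothetical data source: one string column per option key, one row of option values
class EchoOptionsSource extends RelationProvider {
  override def createRelation(
      ctx: SQLContext,
      parameters: Map[String, String]): BaseRelation =
    new BaseRelation with TableScan {
      override val sqlContext: SQLContext = ctx
      override val schema: StructType =
        StructType(parameters.keys.toSeq.map(k => StructField(k, StringType)))
      override def buildScan(): RDD[Row] =
        ctx.sparkContext.parallelize(Seq(Row.fromSeq(parameters.values.toSeq)))
    }
}
No registration is needed: passing the fully qualified class name to read.format(...) is enough for lookupDataSource to find it (the package name would be whatever you choose).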
Back in Spark's source, RelationProvider is a trait, and it already has several implementing classes.
Judging by the package names, the first is for HBase, the second is the JDBC one, and the third and fourth belong to Elasticsearch. The second one is obviously what we need. Drilling into it feels like discovering a new continent:
class DefaultSource extends RelationProvider with DataSourceRegister {

  override def shortName(): String = "jdbc"

  /** Returns a new base relation with the given parameters. */
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String]): BaseRelation = {
    val url = parameters.getOrElse("url", sys.error("Option 'url' not specified"))
    val table = parameters.getOrElse("dbtable", sys.error("Option 'dbtable' not specified"))
    val partitionColumn = parameters.getOrElse("partitionColumn", null)
    val lowerBound = parameters.getOrElse("lowerBound", null)
    val upperBound = parameters.getOrElse("upperBound", null)
    // This option controls how many partitions are used when reading
    val numPartitions = parameters.getOrElse("numPartitions", null)

    if (partitionColumn != null
      && (lowerBound == null || upperBound == null || numPartitions == null)) {
      sys.error("Partitioning incompletely specified")
    }

    val partitionInfo = if (partitionColumn == null) {
      null
    } else {
      JDBCPartitioningInfo(
        partitionColumn,
        lowerBound.toLong,
        upperBound.toLong,
        numPartitions.toInt)
    }
    // This call computes the JDBC partitions
    val parts = JDBCRelation.columnPartition(partitionInfo)
    val properties = new Properties() // Additional properties that we will pass to getConnection
    parameters.foreach(kv => properties.setProperty(kv._1, kv._2))
    JDBCRelation(url, table, parts, properties)(sqlContext)
  }
}
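So the jdbcMap from readOracle can opt into partitioned reads simply by adding the four options parsed above. A sketch with a made-up column name and bounds:
val partitionedJdbcMap = jdbcMap ++ Map(
  "partitionColumn" -> "ID",      // assumed numeric column in the Oracle table
  "lowerBound"      -> "1",       // assumed minimum value of ID
  "upperBound"      -> "1000000", // assumed maximum value of ID
  "numPartitions"   -> "8")
val partitionedDF = hiveContext.read.options(partitionedJdbcMap).format("jdbc").load()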
Next, step into the method behind val parts = JDBCRelation.columnPartition(partitionInfo):
def columnPartition(partitioning: JDBCPartitioningInfo): Array[Partition] = {
  // No partitioning info: return a single partition (index 0) with no WHERE clause
  if (partitioning == null) return Array[Partition](JDBCPartition(null, 0))

  val numPartitions = partitioning.numPartitions
  val column = partitioning.column
  if (numPartitions == 1) return Array[Partition](JDBCPartition(null, 0))
  // Overflow and silliness can happen if you subtract then divide.
  // Here we get a little roundoff, but that's (hopefully) OK.
  val stride: Long = (partitioning.upperBound / numPartitions
    - partitioning.lowerBound / numPartitions)
  var i: Int = 0
  var currentValue: Long = partitioning.lowerBound
  var ans = new ArrayBuffer[Partition]()
  while (i < numPartitions) {
    val lowerBound = if (i != 0) s"$column >= $currentValue" else null
    currentValue += stride
    val upperBound = if (i != numPartitions - 1) s"$column < $currentValue" else null
    val whereClause =
      if (upperBound == null) {
        lowerBound
      } else if (lowerBound == null) {
        upperBound
      } else {
        s"$lowerBound AND $upperBound"
      }
    ans += JDBCPartition(whereClause, i)
    i = i + 1
  }
  ans.toArray
}
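A quick worked example with made-up numbers: for partitionColumn = "ID", lowerBound = 0, upperBound = 100 and numPartitions = 4, the stride is 100/4 - 0/4 = 25 and the generated WHERE clauses are:
partition 0: ID < 25
partition 1: ID >= 25 AND ID < 50
partition 2: ID >= 50 AND ID < 75
partition 3: ID >= 75
The first and last partitions are open-ended, so rows with ID outside [0, 100) still land in one of them; each clause later becomes the predicate of one JDBC query.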
So if the numPartitions option (together with partitionColumn, lowerBound and upperBound) is not set, the read defaults to a single partition, i.e. the entire table is pulled through one task over one JDBC connection.
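A quick way to verify this is to check the RDD partition count of the returned DataFrame (table name is made up, partitionedJdbcMap is the sketch from earlier):
// Defaults to 1 partition when no partitioning options are set
println(readOracle(hiveContext, "EMPLOYEES").rdd.partitions.length)  // 1
// With partitionColumn / lowerBound / upperBound / numPartitions set: numPartitions partitions
println(hiveContext.read.options(partitionedJdbcMap).format("jdbc").load().rdd.partitions.length)  // 8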