Links
# Iceberg catalog
https://iceberg.apache.org/docs/latest/spark-configuration/
相关接口
/**
 * (Scala-specific)
 * Create a table from the given path based on a data source, a schema and a set of options.
 * Then, returns the corresponding DataFrame.
 *
 * @param tableName is either a qualified or unqualified name that designates a table.
 *                  If no database identifier is provided, it refers to a table in
 *                  the current database.
 * @param source the data source name (e.g. "parquet", "json").
 * @param schema the schema of the table being created.
 * @param options data-source-specific options.
 * @return the DataFrame associated with the newly created table.
 * @since 2.0.0
 */
@deprecated("use createTable instead.", "2.2.0")
def createExternalTable(
tableName: String,
source: String,
schema: StructType,
options: Map[String, String]): DataFrame = {
// Deprecated alias: delegates unchanged to createTable with the same arguments.
createTable(tableName, source, schema, options)
}
/**
 * (Scala-specific)
 * Create a table based on the dataset in a data source, a schema and a set of options.
 * Then, returns the corresponding DataFrame.
 *
 * @param tableName is either a qualified or unqualified name that designates a table.
 *                  If no database identifier is provided, it refers to a table in
 *                  the current database.
 * @param source the data source name (e.g. "parquet", "json").
 * @param schema the schema of the table being created.
 * @param options data-source-specific options.
 * @return the DataFrame associated with the newly created table.
 * @since 2.2.0
 */
// Abstract: concrete Catalog implementations provide the behavior.
def createTable(
tableName: String,
source: String,
schema: StructType,
options: Map[String, String]): DataFrame
hive metastore
The default implementation of the Hive metastore in Apache Spark uses Apache Derby for its database persistence. This is available with no configuration required but is limited to only one Spark session at any time for the purposes of metadata storage. This obviously makes it unsuitable for use in multi-user environments, such as when shared on a development team or used in Production.
org.apache.spark.sql.connector.catalog.Catalogs
/**
 * Load and configure a catalog by name.
 * <p>
 * This loads, instantiates, and initializes the catalog plugin for each call; it does not cache
 * or reuse instances.
 *
 * @param name a String catalog name
 * @param conf a SQLConf
 * @return an initialized CatalogPlugin
 * @throws CatalogNotFoundException if the plugin class cannot be found
 * @throws org.apache.spark.SparkException if the plugin class cannot be instantiated
 */
@throws[CatalogNotFoundException]
@throws[SparkException]
def load(name: String, conf: SQLConf): CatalogPlugin = {
// The implementation class is registered under the key "spark.sql.catalog.<name>".
val pluginClassName = try {
conf.getConfString("spark.sql.catalog." + name)
} catch {
case _: NoSuchElementException =>
// No config entry at all means the catalog was never registered.
throw QueryExecutionErrors.catalogPluginClassNotFoundError(name)
}
val loader = Utils.getContextOrSparkClassLoader
try {
val pluginClass = loader.loadClass(pluginClassName)
// Reject classes that do not implement the CatalogPlugin interface.
if (!classOf[CatalogPlugin].isAssignableFrom(pluginClass)) {
throw QueryExecutionErrors.catalogPluginClassNotImplementedError(name, pluginClassName)
}
// Instantiate via the public no-arg constructor, then initialize with the
// catalog's own options (spark.sql.catalog.<name>.* entries).
val plugin = pluginClass.getDeclaredConstructor().newInstance().asInstanceOf[CatalogPlugin]
plugin.initialize(name, catalogOptions(name, conf))
plugin
} catch {
// Each reflection failure mode is mapped to a distinct, user-facing error.
case e: ClassNotFoundException =>
throw QueryExecutionErrors.catalogPluginClassNotFoundForCatalogError(
name, pluginClassName, e)
case e: NoSuchMethodException =>
throw QueryExecutionErrors.catalogFailToFindPublicNoArgConstructorError(
name, pluginClassName, e)
case e: IllegalAccessException =>
throw QueryExecutionErrors.catalogFailToCallPublicNoArgConstructorError(
name, pluginClassName, e)
case e: InstantiationException =>
throw QueryExecutionErrors.cannotInstantiateAbstractCatalogPluginClassError(
name, pluginClassName, e)
case e: InvocationTargetException =>
throw QueryExecutionErrors.failedToInstantiateConstructorForCatalogError(
name, pluginClassName, e)
}
}
这里不仅catalog 反射成功, 还进行了初始化
plugin.initialize(name, catalogOptions(name, conf))
/**
 * Extracts a named catalog's configuration from a SQLConf.
 *
 * Every entry of the form `spark.sql.catalog.<name>.<option>` is exposed to the
 * plugin under the key `<option>`.
 *
 * @param name a catalog name
 * @param conf a SQLConf
 * @return a case insensitive string map of options starting with spark.sql.catalog.(name).
 */
private def catalogOptions(name: String, conf: SQLConf) = {
  val keyPattern = Pattern.compile("^spark\\.sql\\.catalog\\." + name + "\\.(.+)")
  val collected = new util.HashMap[String, String]
  for ((confKey, confValue) <- conf.getAllConfs) {
    val m = keyPattern.matcher(confKey)
    // group(1) is the option name with the catalog prefix stripped.
    if (m.matches && m.groupCount > 0) {
      collected.put(m.group(1), confValue)
    }
  }
  new CaseInsensitiveStringMap(collected)
}
enableHiveSupport 的作用
/**
 * Enables Hive support for the session being built.
 *
 * Fails fast with an IllegalArgumentException when the Hive classes are not on the
 * classpath; otherwise switches the catalog implementation config to "hive".
 */
def enableHiveSupport(): Builder = synchronized {
  // Guard clause: refuse to configure Hive support without the Hive classes present.
  if (!hiveClassesArePresent) {
    throw new IllegalArgumentException(
      "Unable to instantiate SparkSession with Hive support because " +
      "Hive classes are not found.")
  }
  config(CATALOG_IMPLEMENTATION.key, "hive")
}
// Selects the session catalog backend: "hive" (Hive metastore) or the default
// "in-memory". Static conf — it must be set before the SparkSession is created.
val CATALOG_IMPLEMENTATION = buildStaticConf("spark.sql.catalogImplementation")
.internal()
.version("2.0.0")
.stringConf
.checkValues(Set("hive", "in-memory"))
.createWithDefault("in-memory")
// Fully-qualified class name loaded reflectively when Hive support is enabled.
private val HIVE_SESSION_STATE_BUILDER_CLASS_NAME =
"org.apache.spark.sql.hive.HiveSessionStateBuilder"
如果没有 enableHiveSupport,
以下建表语句会报错
sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
Exception in thread "main" org.apache.spark.sql.AnalysisException: Hive support is required to CREATE Hive TABLE (AS SELECT);
'CreateTable `default`.`src`, Ignore
开启 SparkSession 的 Hive 支持,
经过这一步配置, SparkSQL 才会把 SQL 语句当作 HiveSQL 来进行解析
spark-sql / spark-shell 默认将 spark.sql.catalogImplementation 设置为 hive
scala>
scala> sql("set spark.sql.catalogImplementation").show
+--------------------+-----+
| key|value|
+--------------------+-----+
|spark.sql.catalog...| hive|
+--------------------+-----+
show catalogs
// V2 facade over the built-in v1 session catalog. Note: createOptional — no default
// value exists, so this catalog is only materialized once explicitly configured.
val V2_SESSION_CATALOG_IMPLEMENTATION =
buildConf(s"spark.sql.catalog.$SESSION_CATALOG_NAME")
.doc("A catalog implementation that will be used as the v2 interface to Spark's built-in " +
s"v1 catalog: $SESSION_CATALOG_NAME. This catalog shares its identifier namespace with " +
s"the $SESSION_CATALOG_NAME and must be consistent with it; for example, if a table can " +
s"be loaded by the $SESSION_CATALOG_NAME, this catalog must also return the table " +
s"metadata. To delegate operations to the $SESSION_CATALOG_NAME, implementations can " +
"extend 'CatalogExtension'.")
.version("3.0.0")
.stringConf
.createOptional
spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryCatalog].getName)
spark.conf.set("spark.sql.catalog.testpart", classOf[InMemoryPartitionTableCatalog].getName)
spark.conf.set(
"spark.sql.catalog.testcat_atomic", classOf[StagingInMemoryTableCatalog].getName)
spark.conf.set("spark.sql.catalog.testcat2", classOf[InMemoryCatalog].getName)
spark.conf.set(
V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName)
spark-sql> show catalogs;
Time taken: 0.043 seconds
spark.sql.catalog.spark_catalog
引入 spark-sql_2.12-3.3.2-tests.jar 中的 org.apache.spark.sql.connector.InMemoryTableSessionCatalog 进行测试
./bin/spark-sql --master local[2] --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.connector.InMemoryTableSessionCatalog
spark-sql>
>
>
> show catalogs;
spark_catalog
Time taken: 1.425 seconds, Fetched 1 row(s)
spark.sql.catalog.odps
如此配置后,就可以通过执行 use odps; 切换到该 catalog(每次会话都需要执行一次 use)。
--conf spark.sql.catalog.odps=org.apache.spark.sql.execution.datasources.v2.odps.OdpsTableCatalog \
--conf spark.sql.extensions=org.apache.spark.sql.execution.datasources.v2.odps.extension.OdpsExtensions \
use 之后就可以 show 出来了。
spark-sql> show catalogs;
Time taken: 0.043 seconds
spark-sql>
> use odps;
Time taken: 0.234 seconds
spark-sql>
> show catalogs;
odps
Time taken: 0.026 seconds, Fetched 1 row(s)
如下配置,可以避免每次 使用 use 了。
// The catalog used when the user has not issued an explicit "USE <catalog>";
// defaults to the session catalog name (spark_catalog).
val DEFAULT_CATALOG = buildConf("spark.sql.defaultCatalog")
.doc("Name of the default catalog. This will be the current catalog if users have not " +
"explicitly set the current catalog yet.")
.version("3.0.0")
.stringConf
.createWithDefault(SESSION_CATALOG_NAME)
--conf spark.sql.defaultCatalog=odps
“show catalogs ” result 分析
对于 spark.sql.catalog.spark_catalog 来讲, 配置下这个 key 就能show 出来。
但是对于其他的自定义的catalog key ,需要使用 use 命令后才能 show 出来。
原因是 spark_catalog 对应的配置项是 createOptional 而非 createWithDefault(没有默认值),所以需要显式配置之后,该 catalog 才能生效。
show catalogs 的本质是将以下的
org.apache.spark.sql.connector.catalog.CatalogManager
hashmap 的数据结构 catalogs 进行 key 查询
private val catalogs = mutable.HashMap.empty[String, CatalogPlugin]
- 对于 spark_catalog 来说,
/**
 * If the V2_SESSION_CATALOG config is specified, we try to instantiate the user-specified v2
 * session catalog. Otherwise, return the default session catalog.
 *
 * This catalog is a v2 catalog that delegates to the v1 session catalog. it is used when the
 * session catalog is responsible for an identifier, but the source requires the v2 catalog API.
 * This happens when the source implementation extends the v2 TableProvider API and is not listed
 * in the fallback configuration, spark.sql.sources.useV1SourceList
 */
private[sql] def v2SessionCatalog: CatalogPlugin = {
// Only the user-configured implementation is loaded and cached into the `catalogs`
// map under SESSION_CATALOG_NAME; the fallback defaultSessionCatalog is NOT recorded
// there, which is why SHOW CATALOGS omits spark_catalog unless the config is set.
conf.getConf(SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION).map { _ =>
catalogs.getOrElseUpdate(SESSION_CATALOG_NAME, loadV2SessionCatalog())
}.getOrElse(defaultSessionCatalog)
}
这里如果 V2_SESSION_CATALOG_IMPLEMENTATION(即 spark.sql.catalog.spark_catalog,默认是 createOptional)没有配置,那么会走 “getOrElse(defaultSessionCatalog)” 路径,catalogs 自然为空,也就 show 不出来。
如果配置了 V2_SESSION_CATALOG_IMPLEMENTATION, 自然就show 出来了。
- 对于其他catalog来说。
before {
spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryCatalog].getName)
def catalog(name: String): CatalogPlugin = synchronized {
if (name.equalsIgnoreCase(SESSION_CATALOG_NAME)) {
v2SessionCatalog
} else {
catalogs.getOrElseUpdate(name, Catalogs.load(name, conf))
}
}
需要使用 “use” 命令 才可以回填 catalogs
org.apache.spark.sql.connector.catalog.LookupCatalog.scala
/**
 * Resolves a multi-part name: the head is tried as a catalog name; on failure the
 * whole name is treated as relative to the current catalog. Always returns Some.
 */
def unapply(nameParts: Seq[String]): Some[(CatalogPlugin, Seq[String])] = {
assert(nameParts.nonEmpty)
try {
// Head resolves as a catalog; the tail is the identifier inside that catalog.
Some((catalogManager.catalog(nameParts.head), nameParts.tail))
} catch {
case _: CatalogNotFoundException =>
// Head is not a registered catalog: keep all parts and use the current catalog.
Some((currentCatalog, nameParts))
}
}
因为 name 是 “testcat”,直接走 else 路径,这样 catalogs 就有 key-value,自然能 show 出 result
org.apache.spark.sql.connector.catalog.CatalogManager
/**
 * Looks up a catalog by name. Non-session catalogs are loaded on first access and
 * cached into the `catalogs` map (this is the back-fill that makes them visible
 * to SHOW CATALOGS after a USE statement).
 */
def catalog(name: String): CatalogPlugin = synchronized {
if (name.equalsIgnoreCase(SESSION_CATALOG_NAME)) {
// spark_catalog is special-cased and never goes through Catalogs.load here.
v2SessionCatalog
} else {
catalogs.getOrElseUpdate(name, Catalogs.load(name, conf))
}
}
相关单元测试
// Verifies that SHOW CATALOGS lists spark_catalog once a v2 session catalog
// implementation is explicitly configured via V2_SESSION_CATALOG_IMPLEMENTATION.
test("Just verify ShowCatalogs") {
import org.apache.spark.sql.connector.InMemoryTableSessionCatalog
import org.apache.spark.sql.internal.SQLConf.V2_SESSION_CATALOG_IMPLEMENTATION
spark.conf.set(
V2_SESSION_CATALOG_IMPLEMENTATION.key, classOf[InMemoryTableSessionCatalog].getName)
val df = sql("SHOW CATALOGS")
df.show()
}