sql/core/src/main/java/org/apache/spark/sql/execution/datasources/parquet/VectorizedParquetRecordReader.java
for (int i = 0; i < requestedSchema.getFieldCount(); ++i) {
Type t = requestedSchema.getFields().get(i);
if (!t.isPrimitive() || t.isRepetition(Type.Repetition.REPEATED)) {
throw new UnsupportedOperationException("Complex types not supported.");
}
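Note: `isPrimitive()` is a method of Parquet's `Type` API (`org.apache.parquet.schema.Type`); the check above therefore rejects every non-primitive (group) or repeated field with "Complex types not supported."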
Generate Complex Data
// Derived columns for the "lparquet" data set, as (column name -> Spark SQL expression).
// Covers arrays, structs and maps, plus one level of nesting of each inside the others.
val lparquetFields = Seq(
  "kind" -> "1",
  "arr_field" -> "array(1, 2)",
  "arr_arr_field" -> "array(array(1, 2), array(3, 4))",
  "arr_struct_field" -> "array(struct(1, 2), struct(1, 2))",
  "arr_map_field" -> "array(map(1, 2), map(3,4))",
  "struct_field" -> "struct(1, 2)",
  "struct_struct_field" -> "struct(1, struct(1, 2))",
  "struct_array_field" -> "struct(1, array(1, 2))",
  "map_field" -> "map(1, 2)",
  "map_map_field" -> "map(1, map(3,4))",
  "map_arr_field" -> "map(1, array(1, 2))",
  "map_struct_field" -> "map(struct(1, 2), 2)"
)

// `id` comes first, followed by the derived columns in declaration order.
val lparquetColumns = col("id") +: lparquetFields.map { case (name, sql) => expr(sql).as(name) }

// Two rows, collapsed into a single partition, overwriting any previous "lparquet" output.
spark.range(2)
  .select(lparquetColumns: _*)
  .coalesce(1)
  .write
  .format("parquet")
  .mode("overwrite")
  .parquet("lparquet")
// Derived columns for the "rparquet" data set, as (column name -> Spark SQL expression).
val rparquetFields = Seq(
  "kind" -> "id % 2",
  "arr_field" -> "array(1, 2)",
  "struct_field" -> "struct(1, 2)"
)

// `id` comes first, followed by the derived columns in declaration order.
val rparquetColumns = col("id") +: rparquetFields.map { case (name, sql) => expr(sql).as(name) }

// Two rows, collapsed into a single partition, overwriting any previous "rparquet" output.
spark.range(2)
  .select(rparquetColumns: _*)
  .coalesce(1)
  .write
  .format("parquet")
  .mode("overwrite")
  .parquet("rparquet")
// Register the "lparquet" and "rparquet" paths written earlier in this script as the
// catalog tables "ltab" and "rtab".
// NOTE(review): the data source is given as "arrow", which is not one of Spark's built-in
// sources — presumably a plugin provides it; confirm it is on the classpath.
// NOTE(review): a temp view also named "rtab" is created further down in this script;
// verify which of the two the later `SELECT * from rtab` queries are meant to read.
spark.catalog.createTable("ltab", "lparquet", "arrow")
spark.catalog.createTable("rtab", "rparquet", "arrow")
// Nested-type columns for the in-memory frame, as (column name -> Spark SQL expression).
val dfrFields = Seq(
  "kind" -> "id % 2",
  "arr_arr_field" -> "array(array(1, 2), array(3, 4))",
  "arr_struct_field" -> "array(struct(1, 2), struct(1, 2))",
  "arr_map_field" -> "array(map(1, 2), map(3,4))",
  "struct_struct_field" -> "struct(1, struct(1, 2))",
  "struct_array_field" -> "struct(1, array(1, 2))",
  "map_map_field" -> "map(1, map(3,4))",
  "map_arr_field" -> "map(1, array(1, 2))",
  "map_struct_field" -> "map(struct(1, 2), 2)"
)

// Two-row frame: `id` first, then the nested-type columns in declaration order.
val dfr = spark.range(2)
  .select(col("id") +: dfrFields.map { case (name, sql) => expr(sql).as(name) }: _*)

// Expose the frame to SQL under the name "rtab", then print its rows and its schema.
dfr.createOrReplaceTempView("rtab")
spark.sql("SELECT * from rtab").show()
spark.sql("SELECT * from rtab").printSchema()
Output
scala> spark.sql("SELECT * from rtab").show
+---+----+----------------+----------------+--------------------+-------------------+------------------+---------------+-------------+----------------+
| id|kind| arr_arr_field|arr_struct_field| arr_map_field|struct_struct_field|struct_array_field| map_map_field|map_arr_field|map_struct_field|
+---+----+----------------+----------------+--------------------+-------------------+------------------+---------------+-------------+----------------+
| 0| 0|[[1, 2], [3, 4]]|[{1, 2}, {1, 2}]|[{1 -> 2}, {3 -> 4}]| {1, {1, 2}}| {1, [1, 2]}|{1 -> {3 -> 4}}|{1 -> [1, 2]}| {{1, 2} -> 2}|
| 1| 1|[[1, 2], [3, 4]]|[{1, 2}, {1, 2}]|[{1 -> 2}, {3 -> 4}]| {1, {1, 2}}| {1, [1, 2]}|{1 -> {3 -> 4}}|{1 -> [1, 2]}| {{1, 2} -> 2}|
+---+----+----------------+----------------+--------------------+-------------------+------------------+---------------+-------------+----------------+
scala> spark.sql("SELECT * from rtab").printSchema
def printSchema(level: Int): Unit def printSchema(): Unit
scala> spark.sql("SELECT * from rtab").printSchema
root
|-- id: long (nullable = false)
|-- kind: long (nullable = true)
|-- arr_arr_field: array (nullable = false)
| |-- element: array (containsNull = false)
| | |-- element: integer (containsNull = false)
|-- arr_struct_field: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- col1: integer (nullable = false)
| | |-- col2: integer (nullable = false)
|-- arr_map_field: array (nullable = false)
| |-- element: map (containsNull = false)
| | |-- key: integer
| | |-- value: integer (valueContainsNull = false)
|-- struct_struct_field: struct (nullable = false)
| |-- col1: integer (nullable = false)
| |-- col2: struct (nullable = false)
| | |-- col1: integer (nullable = false)
| | |-- col2: integer (nullable = false)
|-- struct_array_field: struct (nullable = false)
| |-- col1: integer (nullable = false)
| |-- col2: array (nullable = false)
| | |-- element: integer (containsNull = false)
|-- map_map_field: map (nullable = false)
| |-- key: integer
| |-- value: map (valueContainsNull = false)
| | |-- key: integer
| | |-- value: integer (valueContainsNull = false)
|-- map_arr_field: map (nullable = false)
| |-- key: integer
| |-- value: array (valueContainsNull = false)
| | |-- element: integer (containsNull = false)
|-- map_struct_field: map (nullable = false)
| |-- key: struct
| | |-- col1: integer (nullable = false)
| | |-- col2: integer (nullable = false)
| |-- value: integer (valueContainsNull = false)