在使用iceberg写数据时,一直弄不清楚为什么iceberg写入快,并且能够保证数据的一致性。今天决定搞清楚这个问题,经过查阅资料和理解,记录如下。
文件格式
iceberg的元数据文件目前有三种:metadata.json,snap.avro,m*.avro,它们分别对应iceberg的metadata文件,manifest-list文件和manifest-file文件。对文件格式内容不了解的同学可以自行查阅资料,这里不做说明。
文件内容的格式如下
metadata.json
{
"format-version" : 2,
"table-uuid" : "da07b515-b2ed-458c-8fc1-fd2650d0f48a",
"location" : "s3a://wux-hoo-dev-01/ice_warehouse/p059_avenger/attr_vals",
"last-sequence-number" : 566,
"last-updated-ms" : 1715896148763,
"last-column-id" : 9,
"current-schema-id" : 0,
"schemas" : [ {
"type" : "struct",
"schema-id" : 0,
"fields" : [ {
"id" : 1,
"name" : "serial_num",
"required" : false,
"type" : "string"
}, {
"id" : 2,
"name" : "trans_seq",
"required" : false,
"type" : "string"
}, {
"id" : 3,
"name" : "attr_name",
"required" : false,
"type" : "string"
}, {
"id" : 4,
"name" : "pre_attr_value",
"required" : false,
"type" : "string"
}, {
"id" : 5,
"name" : "post_attr_value",
"required" : false,
"type" : "string"
}, {
"id" : 6,
"name" : "in_run_file",
"required" : false,
"type" : "string"
}, {
"id" : 7,
"name" : "event_date",
"required" : false,
"type" : "string"
}, {
"id" : 8,
"name" : "family",
"required" : false,
"type" : "string"
}, {
"id" : 9,
"name" : "operation",
"required" : false,
"type" : "string"
} ]
} ],
"default-spec-id" : 0,
"partition-specs" : [ {
"spec-id" : 0,
"fields" : [ {
"name" : "event_date",
"transform" : "identity",
"source-id" : 7,
"field-id" : 1000
}, {
"name" : "family",
"transform" : "identity",
"source-id" : 8,
"field-id" : 1001
}, {
"name" : "operation",
"transform" : "identity",
"source-id" : 9,
"field-id" : 1002
} ]
} ],
"last-partition-id" : 1002,
"default-sort-order-id" : 0,
"sort-orders" : [ {
"order-id" : 0,
"fields" : [ ]
} ],
"properties" : {
"owner" : "root",
"write.metadata.delete-after-commit.enabled" : "true",
"schema.name-mapping.default" : "[{ \"field-id\":1, \"names\": [\"serial_num\"] },{ \"field-id\":2, \"names\": [\"trans_seq\"] },{ \"field-id\":3, \"names\": [\"attr_name\"] },{ \"field-id\":4, \"names\": [\"pre_attr_value\"] },{ \"field-id\":5, \"names\": [\"post_attr_value\"] },{ \"field-id\":6, \"names\": [\"in_run_file\"] },{ \"field-id\":7, \"names\": [\"event_date\"] },{ \"field-id\":8, \"names\": [\"family\"] },{ \"field-id\":9, \"names\": [\"operation\"] }]",
"write.metadata.previous-versions-max" : "3",
"write.parquet.compression-codec" : "zstd"
},
"current-snapshot-id" : 4890912288753727730,
"refs" : {
"main" : {
"snapshot-id" : 4890912288753727730,
"type" : "branch"
}
},
"snapshots" : [ {
"sequence-number" : 566,
"snapshot-id" : 4890912288753727730,
"parent-snapshot-id" : 3610607578140763977,
"timestamp-ms" : 1715892795791,
"summary" : {
"operation" : "replace",
"snapshot.producer" : "OPTIMIZE",
"added-data-files" : "1",
"deleted-data-files" : "3",
"added-records" : "8728",
"deleted-records" : "8728",
"added-files-size" : "29130",
"removed-files-size" : "42114",
"changed-partition-count" : "1",
"total-records" : "1329523",
"total-files-size" : "2771968",
"total-data-files" : "39",
"total-delete-files" : "0",
"total-position-deletes" : "0",
"total-equality-deletes" : "0"
},
"manifest-list" : "s3a://wux-hoo-dev-01/ice_warehouse/p059_avenger/attr_vals/metadata/snap-4890912288753727730-1-2b4564d4-e38e-4b5e-bf85-2660fa400b35.avro",
"schema-id" : 0
} ],
"statistics" : [ ],
"partition-statistics" : [ ],
"snapshot-log" : [ {
"timestamp-ms" : 1715892795791,
"snapshot-id" : 4890912288753727730
} ],
"metadata-log" : [ {
"timestamp-ms" : 1715889404796,
"metadata-file" : "s3a://wux-hoo-dev-01/ice_warehouse/p059_avenger/attr_vals/metadata/00578-0268ca68-4e34-4717-9cb1-1540070cb374.metadata.json"
}, {
"timestamp-ms" : 1715892548263,
"metadata-file" : "s3a://wux-hoo-dev-01/ice_warehouse/p059_avenger/attr_vals/metadata/00579-8f364707-92b4-4c36-8b19-4080c5fd6e69.metadata.json"
}, {
"timestamp-ms" : 1715892795791,
"metadata-file" : "s3a://wux-hoo-dev-01/ice_warehouse/p059_avenger/attr_vals/metadata/00580-c9b10830-04e3-4bc8-b414-57c86eca7c0d.metadata.json"
} ]
}
snap.avro
{
"manifest_path":
"s3://datalake/db1/orders/metadata/62acb3d7-e992-4cbc-8e41-58809fcacb3e.avro",
"manifest_length": 6152,
"added_snapshot_id": 8333017788700497002,
"added_data_files_count": 1,
"added_rows_count": 1,
"deleted_rows_count": 0,
"partitions": {
"array": [ {
"contains_null": false,
"lower_bound": {
"bytes": "¹Ô\\\\u0006\\\\u0000"
},
"upper_bound": {
"bytes": "¹Ô\\\\u0006\\\\u0000"
}
} ]
}
}
m*.avro
{
"data_file" : {
"file_path" :
"s3://datalake/db1/orders/data/order_ts_hour=2023-03-07-08/0_0_0.parquet",
"file_format" : "PARQUET",
"block_size_in_bytes" : 67108864,
"null_value_counts" : [],
"lower_bounds" : {
"array": [{
"key": 1,
"value": 123
}]
},
"upper_bounds" : {
"array": [{
"key": 1,
"value": 123
}]
}
}
}
写入过程
在写入数据时,iceberg会先写入数据文件data.parquet,然后写m*.avro,再写snap.avro,然后是metadata.json,最后更新catalog数据库(例如使用JDBC catalog时)中iceberg_tables表的metadata_location字段,使其指向新的metadata.json。如此则完成一次完整写入。
高并发一致性
那么在高并发的时候,同时操作某张表时是如何运作的呢?这里iceberg采用了乐观锁的方法。在写入数据的时候是有一个验证操作的,即在准备写入时会检查原文件和待写入文件的差异,如果存在本次写入数据以外的数据,则代表有其他操作已经更新了原文件,那么本次写入会取消,并重试写入的步骤,直到验证通过,写入完成。如此就保证了数据的一致性,并且也解决了HMS锁的问题。