Reading Hudi Parquet files with Java
Hudi version: 0.10.1
- pom dependencies
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-avro</artifactId>
    <version>1.12.0</version>
</dependency>
<dependency>
    <groupId>org.apache.avro</groupId>
    <artifactId>avro</artifactId>
    <version>1.10.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-common</artifactId>
    <version>3.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>3.3.1</version>
</dependency>
- code
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;

import java.io.IOException;

public class HudiParquetParser {
    public static void main(String[] args) throws IOException {
        // Path to a Parquet base file written by Hudi
        String parquetFilePath = "/Users/lxq/Desktop/depark/test/parquet/user/45b734d2-bcff-406f-819f-2ba097356f10_0-2-0_20230703193743155.parquet";
        Configuration conf = new Configuration();
        // Read the Parquet file as Avro GenericRecords
        try (ParquetReader<GenericRecord> reader = AvroParquetReader
                .<GenericRecord>builder(HadoopInputFile.fromPath(new Path(parquetFilePath), conf))
                .withConf(conf)
                .build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) {
                // Print every field of the current record
                Schema schema = record.getSchema();
                for (Schema.Field field : schema.getFields()) {
                    String fieldName = field.name();
                    Object value = record.get(fieldName);
                    System.out.println(fieldName + ": " + value);
                }
                System.out.println("======================");
            }
        }
    }
}
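Two points worth knowing when reading Hudi base files this way. First, every record carries Hudi's five bookkeeping columns (_hoodie_commit_time, _hoodie_commit_seqno, _hoodie_record_key, _hoodie_partition_path, _hoodie_file_name) alongside the table's own fields, so you usually want to skip the `_hoodie_` prefix when printing business data. Second, the Parquet footer already holds the schema and record count, so you can inspect those without scanning any rows. A minimal sketch combining both ideas; the class name HudiParquetInspector and taking the file path from args[0] are my own additions, not from the original:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

import java.io.IOException;

public class HudiParquetInspector {
    public static void main(String[] args) throws IOException {
        // Hypothetical usage: pass the Hudi parquet file path as the first argument
        String parquetFilePath = args[0];
        Configuration conf = new Configuration();
        HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(parquetFilePath), conf);

        // 1) Footer only: schema and record count, no row scan
        try (ParquetFileReader fileReader = ParquetFileReader.open(inputFile)) {
            MessageType parquetSchema = fileReader.getFooter().getFileMetaData().getSchema();
            Schema avroSchema = new AvroSchemaConverter(conf).convert(parquetSchema);
            System.out.println("records: " + fileReader.getRecordCount());
            System.out.println(avroSchema.toString(true));
        }

        // 2) Full scan, printing only business fields
        try (ParquetReader<GenericRecord> reader = AvroParquetReader
                .<GenericRecord>builder(inputFile)
                .withConf(conf)
                .build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) {
                for (Schema.Field field : record.getSchema().getFields()) {
                    if (field.name().startsWith("_hoodie_")) {
                        continue; // skip Hudi bookkeeping columns
                    }
                    System.out.println(field.name() + ": " + record.get(field.name()));
                }
                System.out.println("======================");
            }
        }
    }
}

Note that for Merge-on-Read tables this only sees the Parquet base file; updates still sitting in Avro log files are not reflected until compaction.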