概要
通过 Java 读取 Parquet 文件,读取文件时有多种写法
- 通过GenericRecord按行读取
ParquetReader reader = AvroParquetReader.builder(new AvroReadSupport(), parquetFilePath).build()
- 通过Group按行组读取,也可以读取列信息
ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()))
pom引入
<dependencies>
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-avro</artifactId>
<version>1.12.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.3.1</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>3.3.1</version>
</dependency>
</dependencies>
行式读取
- 通过GenericRecord读取行数据
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
public class ParquetRecordReader {
// 指定 Parquet 文件路径
public static List<Map<String,Object>> readParquetFileWithRecord(String filePath) throws IOException {
// 拼接parquet文件全路径
Path parquetFilePath = new Path(filePath);
ParquetReader<GenericRecord> reader = AvroParquetReader.builder(new AvroReadSupport(), parquetFilePath).build();
GenericRecord record;
List<Map<String,Object>> recordList = new ArrayList<>();
// 开始遍历行数据
while ((record = reader.read()) != null) {
Map<String,Object> recordMap = new HashMap<>();
Schema schema = record.getSchema();
// 行的字段信息
List<Schema.Field> fields = schema.getFields();
GenericRecord finalRecord = record;
fields.stream