Overview
There are several ways to read a Parquet file from Java:
- Row by row via GenericRecord:
ParquetReader reader = AvroParquetReader.builder(new AvroReadSupport(), parquetFilePath).build()
- Row group by row group via Group, which also exposes per-column information:
ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()))
Maven dependencies (pom.xml)
<dependencies>
    <dependency>
        <groupId>org.apache.parquet</groupId>
        <artifactId>parquet-avro</artifactId>
        <version>1.12.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.3.1</version>
    </dependency>
</dependencies>
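The read examples below expect a test1.parquet file on disk. If you do not have one handy, here is a minimal sketch for generating one with AvroParquetWriter; the two-field schema (id, name) and the output path are assumptions for illustration only.
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import java.io.IOException;

public class ParquetSampleWriter {
    public static void main(String[] args) throws IOException {
        // Hypothetical two-field schema; adjust to match your own data
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"TestRecord\",\"fields\":["
                + "{\"name\":\"id\",\"type\":\"int\"},"
                + "{\"name\":\"name\",\"type\":\"string\"}]}");
        Path path = new Path("D:\\parquet\\file\\test1.parquet");
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
                .<GenericRecord>builder(path)
                .withSchema(schema)
                .build()) {
            for (int i = 0; i < 3; i++) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("id", i);
                record.put("name", "row-" + i);
                writer.write(record);
            }
        }
    }
}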
Row-wise reading
- Reading row data via GenericRecord
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetRecordReader {
    // Read every row of a Parquet file into a list of field-name -> value maps
    public static List<Map<String, Object>> readParquetFileWithRecord(String filePath) throws IOException {
        // Build the full path to the Parquet file
        Path parquetFilePath = new Path(filePath);
        List<Map<String, Object>> recordList = new ArrayList<>();
        try (ParquetReader<GenericRecord> reader =
                     AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), parquetFilePath).build()) {
            GenericRecord record;
            // Iterate over the rows
            while ((record = reader.read()) != null) {
                Map<String, Object> recordMap = new HashMap<>();
                Schema schema = record.getSchema();
                // The row's field definitions
                List<Schema.Field> fields = schema.getFields();
                // Effectively-final copy so the lambda below can capture it
                GenericRecord finalRecord = record;
                fields.forEach(item -> {
                    // Look up each value by its field name
                    String name = item.name();
                    Object val = finalRecord.get(name);
                    recordMap.put(name, val);
                });
                recordList.add(recordMap);
            }
        }
        return recordList;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator + "test1.parquet";
        readParquetFileWithRecord(filePath).forEach(System.out::println);
    }
}
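In recent parquet-avro releases the (ReadSupport, Path) builder overload is deprecated. As a sketch, the equivalent InputFile-based construction (assuming the same imports plus HadoopInputFile and Configuration) looks like this:
ParquetReader<GenericRecord> reader = AvroParquetReader
        .<GenericRecord>builder(HadoopInputFile.fromPath(new Path(filePath), new Configuration()))
        .build();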
- Reading row data via Group
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetGroupReader {
    public static List<Map<String, Object>> readParquetFileWithGroup(String filePath) throws IOException {
        List<Map<String, Object>> recordList = new ArrayList<>();
        // Open the Parquet file
        ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
        // Read the schema from the footer metadata
        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
        List<Type> fields = schema.getFields();
        PageReadStore pages;
        // Iterate over the row groups; a small file usually holds a single row group, a large file may hold several
        while ((pages = reader.readNextRowGroup()) != null) {
            long rows = pages.getRowCount();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            RecordReader<Group> recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
            // Iterate over the rows of the current row group
            for (int i = 0; i < rows; i++) {
                Map<String, Object> recordMap = new HashMap<>();
                SimpleGroup simpleGroup = (SimpleGroup) recordReader.read();
                // Iterate over the columns and collect each value
                fields.forEach(item -> {
                    final String name = item.getName();
                    // Resolve the column index from its name, then fetch the value
                    final int fieldIndex = simpleGroup.getType().getFieldIndex(name);
                    final String valueToString = simpleGroup.getValueToString(fieldIndex, 0);
                    recordMap.put(name, valueToString);
                });
                recordList.add(recordMap);
            }
        }
        reader.close();
        return recordList;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator + "test1.parquet";
        readParquetFileWithGroup(filePath).forEach(System.out::println);
    }
}
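ParquetFileReader also exposes the file-level metadata directly, which is handy before deciding how to read. A small sketch, using the same imports as above:
ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
System.out.println("Total rows: " + reader.getRecordCount());
System.out.println("Row groups: " + reader.getRowGroups().size());
System.out.println("Schema:     " + reader.getFooter().getFileMetaData().getSchema());
reader.close();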
Column-wise reading
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Version;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import java.io.File;
import java.io.IOException;
import java.util.*;

public class ParquetColumnReader {
    public static Map<String, List<String>> readParquetFileWithColumn(String filePath) throws IOException {
        Map<String, List<String>> columnMap = new HashMap<>();
        Configuration conf = new Configuration();
        final Path path = new Path(filePath);
        // Read the footer metadata, then open the file with it
        ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
        // Iterate over the row groups
        PageReadStore rowGroup = null;
        while (null != (rowGroup = r.readNextRowGroup())) {
            // Column-level read access for the current row group
            ColumnReadStore colReadStore = new ColumnReadStoreImpl(rowGroup, new GroupRecordConverter(schema).getRootConverter(), schema, Version.FULL_VERSION);
            List<ColumnDescriptor> descriptorList = schema.getColumns();
            // Iterate over the columns
            for (ColumnDescriptor colDescriptor : descriptorList) {
                String[] columnNamePath = colDescriptor.getPath();
                // Dotted column path, e.g. "name", or "address.city" for nested fields
                String columnName = String.join(".", columnNamePath);
                ColumnReader colReader = colReadStore.getColumnReader(colDescriptor);
                // Number of values stored in this column chunk
                long totalValuesInColumnChunk = rowGroup.getPageReader(colDescriptor).getTotalValueCount();
                // The primitive type decides which typed getter to call
                PrimitiveType.PrimitiveTypeName type = colDescriptor.getType();
                List<String> columnList = new ArrayList<>();
                columnMap.put(columnName, columnList);
                // Iterate over the values of the column; this assumes required (non-null) columns
                for (long i = 0; i < totalValuesInColumnChunk; i++) {
                    String val;
                    switch (type) {
                        case INT32:
                            val = String.valueOf(colReader.getInteger());
                            break;
                        case INT64:
                            val = String.valueOf(colReader.getLong());
                            break;
                        case FLOAT:
                            val = String.valueOf(colReader.getFloat());
                            break;
                        case DOUBLE:
                            val = String.valueOf(colReader.getDouble());
                            break;
                        case BOOLEAN:
                            val = String.valueOf(colReader.getBoolean());
                            break;
                        default:
                            // BINARY, FIXED_LEN_BYTE_ARRAY, INT96: render as UTF-8 text
                            val = colReader.getBinary().toStringUsingUTF8();
                            break;
                    }
                    columnList.add(val);
                    colReader.consume();
                }
            }
        }
        r.close();
        return columnMap;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator + "test1.parquet";
        readParquetFileWithColumn(filePath).forEach((k, v) -> System.out.println(k + " -> " + v));
    }
}
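One caveat with this low-level API: the inner loop above reads a raw value at every position, which is only safe when the column is required. For an optional (nullable) column, a value exists at a position only when the current definition level equals the column's maximum definition level. A hedged sketch of the extra check, reusing the variables from the loop above:
int maxDefinitionLevel = colDescriptor.getMaxDefinitionLevel();
for (long i = 0; i < totalValuesInColumnChunk; i++) {
    if (colReader.getCurrentDefinitionLevel() == maxDefinitionLevel) {
        // A real value is present at this position; read it with the appropriate typed getter
        columnList.add(String.valueOf(colReader.getInteger()));
    } else {
        // The value at this position is null
        columnList.add(null);
    }
    colReader.consume();
}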