Reading Local Parquet Files in Java

Overview

Reading a Parquet file from Java can be done in several ways:

  • Read row by row via GenericRecord

ParquetReader reader = AvroParquetReader.builder(new AvroReadSupport(), parquetFilePath).build()

  • Read by row group via Group; this path also exposes column information

ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()))

pom.xml dependencies

<dependencies>
    <dependency>
        <groupId>org.apache.parquet</groupId>
        <artifactId>parquet-avro</artifactId>
        <version>1.12.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.3.1</version>
    </dependency>
</dependencies>
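Note: hadoop-common routes even purely local paths through Hadoop's file system layer. On Windows this may log or throw a "HADOOP_HOME and hadoop.home.dir are unset" error; a commonly used workaround (the D:\hadoop path below is only an example, point it at the directory that contains bin\winutils.exe) is to set hadoop.home.dir before touching any Hadoop class:

// Hypothetical winutils install directory; adjust to your local setup
System.setProperty("hadoop.home.dir", "D:\\hadoop");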

Row-based reading

  • Reading rows with GenericRecord
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetRecordReader {
    // Read the Parquet file at the given path and return one Map per row
    public static List<Map<String,Object>> readParquetFileWithRecord(String filePath) throws IOException {
        //  Build the Parquet file path
        Path parquetFilePath = new Path(filePath);
        ParquetReader<GenericRecord> reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), parquetFilePath).build();
        GenericRecord record;
        List<Map<String,Object>> recordList = new ArrayList<>();
        //  Iterate over the rows
        while ((record = reader.read()) != null) {
            Map<String,Object> recordMap = new HashMap<>();
            Schema schema = record.getSchema();
            //  Field metadata of the row
            List<Schema.Field> fields = schema.getFields();
            GenericRecord finalRecord = record;
            fields.forEach(item -> {
                //  Look up the value by field name
                String name = item.name();
                Object val = finalRecord.get(name);
                recordMap.put(name, val);
            });
            recordList.add(recordMap);
        }
        reader.close();
        return recordList;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator+ "test1.parquet";
        readParquetFileWithRecord(filePath);
    }
}
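The builder(ReadSupport, Path) overload used above still works but is marked deprecated in recent parquet-mr releases. A minimal sketch of the same row-by-row read using the InputFile-based builder and try-with-resources (an alternative variant, not the article's original code; the class name is illustrative) could look like this:

import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import java.io.IOException;

public class ParquetRecordInputFileReader {
    public static void readWithInputFile(String filePath) throws IOException {
        // Wrap the path together with the Hadoop configuration
        HadoopInputFile inputFile = HadoopInputFile.fromPath(new Path(filePath), new Configuration());
        // try-with-resources closes the reader even when an exception is thrown
        try (ParquetReader<GenericRecord> reader = AvroParquetReader.<GenericRecord>builder(inputFile).build()) {
            GenericRecord record;
            while ((record = reader.read()) != null) {
                System.out.println(record);
            }
        }
    }
}

Passing an InputFile also makes the Hadoop Configuration explicit, which is convenient when the file does not live on the default local file system.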
  • Reading rows with Group
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetGroupReader {
    public static List<Map<String,Object>> readParquetFileWithGroup(String filePath) throws IOException {
        List<Map<String,Object>> recordList = new ArrayList<>();
        //  Open the Parquet file
        ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
        //  Read the schema from the file footer metadata
        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
        List<Type> fields = schema.getFields();
        PageReadStore pages;
        //  Iterate over row groups; small files usually contain a single row group, larger files may contain several
        while ((pages = reader.readNextRowGroup()) != null) {
            long rows = pages.getRowCount();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
            //  Iterate over the rows of the current row group
            for (int i = 0; i < rows; i++) {
                Map<String,Object> recordMap = new HashMap<>();
                SimpleGroup simpleGroup = (SimpleGroup) recordReader.read();
                //  Iterate over the columns of the row
                fields.forEach(item -> {
                    final String name = item.getName();
                    //  Resolve the field index from the column name, then read the first value at that index
                    final int fieldIndex = simpleGroup.getType().getFieldIndex(name);
                    final String valueToString = simpleGroup.getValueToString(fieldIndex, 0);
                    recordMap.put(name, valueToString);
                });
                recordList.add(recordMap);
            }
        }
        reader.close();
        return recordList;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator + "test1.parquet";
        readParquetFileWithGroup(filePath);
    }
}
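getValueToString is convenient but turns every value into a string. Group also offers typed accessors (getInteger, getLong, getDouble, getBoolean, getString, ...). A minimal sketch of type-aware extraction, assuming a flat schema of primitive, non-repeated fields (no INT96, no nested groups), might be:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.schema.Type;

public class GroupValueExtractor {
    // Illustrative helper: return one field of a Group as a typed Java object.
    // Assumes primitive, non-repeated fields, so the value index is always 0.
    static Object extractValue(Group group, Type field) {
        String name = field.getName();
        switch (field.asPrimitiveType().getPrimitiveTypeName()) {
            case INT32:   return group.getInteger(name, 0);
            case INT64:   return group.getLong(name, 0);
            case DOUBLE:  return group.getDouble(name, 0);
            case FLOAT:   return group.getFloat(name, 0);
            case BOOLEAN: return group.getBoolean(name, 0);
            default:      return group.getString(name, 0); // BINARY/UTF8 and everything else as text
        }
    }
}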

Column-based reading

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Version;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import java.io.File;
import java.io.IOException;
import java.util.*;

public class ParquetColumnReader {
    public static Map<String, List<String>> readParquetFileWithColumn(String filePath) throws IOException {
        Map<String, List<String>> columnMap = new HashMap<>();
        Configuration conf = new Configuration();
        final Path path = new Path(filePath);
        ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
        //  Iterate over the row groups
        PageReadStore rowGroup = null;
        while (null != (rowGroup = r.readNextRowGroup())) {
            ColumnReader colReader = null;
            //  Column read store for the current row group
            ColumnReadStore colReadStore = new ColumnReadStoreImpl(rowGroup, new GroupRecordConverter(schema).getRootConverter(), schema, Version.FULL_VERSION);
            List<ColumnDescriptor> descriptorList = schema.getColumns();
            //  Iterate over the columns
            for (ColumnDescriptor colDescriptor : descriptorList) {
                String[] columnNamePath = colDescriptor.getPath();
                //  Column name (the path segments joined with ".")
                String columnName = String.join(".", columnNamePath);
                colReader = colReadStore.getColumnReader(colDescriptor);
                //  Number of values in this column chunk
                long totalValuesInColumnChunk = rowGroup.getPageReader(colDescriptor).getTotalValueCount();
                //  Primitive type of the column; it decides which getter to call
                PrimitiveType.PrimitiveTypeName type = colDescriptor.getType();
                List<String> columnList = new ArrayList<>();
                columnMap.put(columnName, columnList);
                //  Iterate over the values of the column (assumes required, non-null columns)
                for (int i = 0; i < totalValuesInColumnChunk; i++) {
                    String val;
                    switch (type) {
                        case INT32:
                            val = String.valueOf(colReader.getInteger());
                            break;
                        case INT64:
                            val = String.valueOf(colReader.getLong());
                            break;
                        case DOUBLE:
                            val = String.valueOf(colReader.getDouble());
                            break;
                        case FLOAT:
                            val = String.valueOf(colReader.getFloat());
                            break;
                        case BOOLEAN:
                            val = String.valueOf(colReader.getBoolean());
                            break;
                        default:
                            val = colReader.getBinary().toStringUsingUTF8();
                            break;
                    }
                    columnList.add(val);
                    colReader.consume();
                }
            }
        }
        r.close();
        return columnMap;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator+ "test1.parquet";
        readParquetFileWithColumn(filePath);
    }
}
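The value loop above assumes every column is required, i.e. never null. For optional columns a value is only physically stored when the current definition level reaches the column's maximum definition level. A hedged sketch of a helper that checks this before reading (the helper name is illustrative; the caller still calls colReader.consume() after every position, exactly as in the loop above):

import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReader;

public class ColumnValueHelper {
    // Illustrative helper: read the current value as a string, or return null when
    // the value is absent (definition level below the maximum means a null value).
    static String readCurrentValueOrNull(ColumnReader colReader, ColumnDescriptor colDescriptor) {
        if (colReader.getCurrentDefinitionLevel() < colDescriptor.getMaxDefinitionLevel()) {
            return null; // nothing stored at this position
        }
        switch (colDescriptor.getType()) {
            case INT32:  return String.valueOf(colReader.getInteger());
            case INT64:  return String.valueOf(colReader.getLong());
            case DOUBLE: return String.valueOf(colReader.getDouble());
            default:     return colReader.getBinary().toStringUsingUTF8();
        }
    }
}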