Reading Local Parquet Files in Java

Overview

There are several ways to read a Parquet file in Java:

  • Read row by row with GenericRecord

ParquetReader reader = AvroParquetReader.builder(new AvroReadSupport(), parquetFilePath).build()

  • Read by row group with Group, which also gives access to column-level data

ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()))

Maven dependencies (pom.xml)

<dependencies>
    <dependency>
        <groupId>org.apache.parquet</groupId>
        <artifactId>parquet-avro</artifactId>
        <version>1.12.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-client</artifactId>
        <version>3.3.1</version>
    </dependency>
</dependencies>
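
The examples below read a local file such as D:\parquet\file\test1.parquet. If you do not already have one, the following is a minimal sketch (not part of the original examples; it assumes a hypothetical two-column schema with an id and a name field) that writes a small test file using AvroParquetWriter from the same parquet-avro dependency:

import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import java.io.IOException;

public class ParquetTestFileWriter {
    public static void main(String[] args) throws IOException {
        //  Hypothetical schema: an id (long) column and a name (string) column
        Schema schema = SchemaBuilder.record("TestRecord").fields()
                .requiredLong("id")
                .requiredString("name")
                .endRecord();
        Path path = new Path("D:\\parquet\\file\\test1.parquet");
        //  OVERWRITE so the sketch can be re-run against an existing file
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(path)
                .withSchema(schema)
                .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
                .build()) {
            for (long i = 0; i < 3; i++) {
                GenericRecord record = new GenericData.Record(schema);
                record.put("id", i);
                record.put("name", "row-" + i);
                writer.write(record);
            }
        }
    }
}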

Row-based reading

  • Reading row data with GenericRecord
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetReader;
import org.apache.parquet.avro.AvroReadSupport;
import org.apache.parquet.hadoop.ParquetReader;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetRecordReader {
    //  Read a Parquet file row by row and return each row as a column-name -> value map
    public static List<Map<String,Object>> readParquetFileWithRecord(String filePath) throws IOException {
        //  Build the Parquet file path
        Path parquetFilePath = new Path(filePath);
        ParquetReader<GenericRecord> reader = AvroParquetReader.builder(new AvroReadSupport<GenericRecord>(), parquetFilePath).build();
        GenericRecord record;
        List<Map<String,Object>> recordList = new ArrayList<>();
        //  Iterate over the rows
        while ((record = reader.read()) != null) {
            Map<String,Object> recordMap = new HashMap<>();
            Schema schema = record.getSchema();
            //  Field (column) definitions of this row
            List<Schema.Field> fields = schema.getFields();
            //  Effectively-final copy of the record for use inside the lambda
            GenericRecord finalRecord = record;
            fields.forEach(item -> {
                //  Look up the value by field name
                String name = item.name();
                Object val = finalRecord.get(name);
                recordMap.put(name,val);
            });
            recordList.add(recordMap);
        }
        reader.close();
        return recordList;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator+ "test1.parquet";
        readParquetFileWithRecord(filePath);
    }
}
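
A note on the values returned by the Avro-based reader: string columns typically come back as org.apache.avro.util.Utf8 (a CharSequence) rather than java.lang.String. Below is a minimal caller sketch; the id and name column names are just the assumed test schema from the writer sketch above, not something the reader code requires:

import java.io.IOException;
import java.util.List;
import java.util.Map;

public class ParquetRecordReaderDemo {
    public static void main(String[] args) throws IOException {
        List<Map<String, Object>> rows =
                ParquetRecordReader.readParquetFileWithRecord("D:\\parquet\\file\\test1.parquet");
        for (Map<String, Object> row : rows) {
            //  Avro string fields are Utf8/CharSequence; call toString() before treating them as String
            Object name = row.get("name");
            String nameAsString = (name == null) ? null : name.toString();
            System.out.println("id=" + row.get("id") + ", name=" + nameAsString);
        }
    }
}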
  • Reading row data with Group
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class ParquetGroupReader {
    public static List<Map<String,Object>> readParquetFileWithGroup(String filePath) throws IOException {
        List<Map<String,Object>> recordList = new ArrayList<>();
        //  Open the Parquet file
        ParquetFileReader reader = ParquetFileReader.open(HadoopInputFile.fromPath(new Path(filePath), new Configuration()));
        //  Read the schema from the file footer metadata
        MessageType schema = reader.getFooter().getFileMetaData().getSchema();
        List<Type> fields = schema.getFields();
        PageReadStore pages;
        //  Iterate over row groups; small files usually have one, larger files may contain several
        while ((pages = reader.readNextRowGroup()) != null) {
            long rows = pages.getRowCount();
            MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
            RecordReader recordReader = columnIO.getRecordReader(pages, new GroupRecordConverter(schema));
            //  Iterate over the rows in this row group
            for (int i = 0; i < rows; i++) {
                Map<String,Object> recordMap = new HashMap<>();
                SimpleGroup simpleGroup = (SimpleGroup) recordReader.read();
                //  Iterate over the columns to collect the values
                fields.stream().forEach(item->{
                    final String name = item.getName();
                    //  Get the field index by column name, then read the value
                    final int fieldIndex = simpleGroup.getType().getFieldIndex(name);
                    final String valueToString = simpleGroup.getValueToString(fieldIndex, 0);
                    recordMap.put(name,valueToString);
                });
                recordList.add(recordMap);
            }
        }
        reader.close();
        return recordList;
    }
    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator+ "test1.parquet";
        readParquetFileWithGroup(filePath);
    }
}
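
One caveat with getValueToString(fieldIndex, 0): if an optional column has no value in a given row, the call throws instead of returning null. Below is a small null-tolerant helper, offered as a sketch rather than part of the original code, that checks the field's repetition count before reading:

import org.apache.parquet.example.data.Group;
import org.apache.parquet.schema.GroupType;
import java.util.HashMap;
import java.util.Map;

public class GroupRowMapper {
    //  Convert one Group row into a column-name -> value map, tolerating missing optional values
    public static Map<String, Object> toMap(Group group) {
        Map<String, Object> row = new HashMap<>();
        GroupType type = group.getType();
        for (int i = 0; i < type.getFieldCount(); i++) {
            String name = type.getFieldName(i);
            //  An optional field with no value in this row has repetition count 0
            Object value = group.getFieldRepetitionCount(i) > 0
                    ? group.getValueToString(i, 0)
                    : null;
            row.put(name, value);
        }
        return row;
    }
}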

Column-based reading

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.Version;
import org.apache.parquet.column.ColumnDescriptor;
import org.apache.parquet.column.ColumnReadStore;
import org.apache.parquet.column.ColumnReader;
import org.apache.parquet.column.impl.ColumnReadStoreImpl;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;
import java.io.File;
import java.io.IOException;
import java.util.*;

public class ParquetColumnReader {
    public static Map<String, List<String>> readParquetFileWithColumn(String filePath) throws IOException {
        Map<String, List<String>> columnMap = new HashMap<>();
        Configuration conf = new Configuration();
        final Path path = new Path(filePath);
        ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
        //  Iterate over row groups
        PageReadStore rowGroup = null;
        while (null != (rowGroup = r.readNextRowGroup())) {
            ColumnReader colReader = null;
            //  Build a ColumnReadStore for this row group
            ColumnReadStore colReadStore = new ColumnReadStoreImpl(rowGroup, new GroupRecordConverter(schema).getRootConverter(), schema,Version.FULL_VERSION);
            List<ColumnDescriptor> descriptorList = schema.getColumns();
            //  Iterate over the columns
            for (ColumnDescriptor colDescriptor : descriptorList) {
                String[] columnNamePath = colDescriptor.getPath();
                //  Column name (dot-joined path, to handle nested columns)
                String columnName = String.join(".", columnNamePath);
                colReader = colReadStore.getColumnReader(colDescriptor);
                //  Number of values in this column chunk
                long totalValuesInColumnChunk = rowGroup.getPageReader(colDescriptor).getTotalValueCount();
                //  Primitive type of the column; it determines which accessor to call
                PrimitiveType.PrimitiveTypeName type = colDescriptor.getType();
                final String name = type.name();
                List<String> columnList = new ArrayList<>();
                columnMap.put(columnName, columnList);
                //  Iterate over every value in the column
                for (int i = 0; i < totalValuesInColumnChunk; i++) {
                    String val;
                    if (name.equals("INT32")) {
                        val = String.valueOf(colReader.getInteger());
                    } else if (name.equals("INT64")) {
                        val = String.valueOf(colReader.getLong());
                    } else if (name.equals("FLOAT")) {
                        val = String.valueOf(colReader.getFloat());
                    } else if (name.equals("DOUBLE")) {
                        val = String.valueOf(colReader.getDouble());
                    } else if (name.equals("BOOLEAN")) {
                        val = String.valueOf(colReader.getBoolean());
                    } else {
                        //  BINARY / FIXED_LEN_BYTE_ARRAY: read as a UTF-8 string
                        val = colReader.getBinary().toStringUsingUTF8();
                    }
                    columnList.add(val);
                    colReader.consume();
                }
            }
        }
        r.close();
        return columnMap;
    }

    public static void main(String[] args) throws IOException {
        String filePath = "D:\\parquet\\file" + File.separator+ "test1.parquet";
        readParquetFileWithColumn(filePath);
    }
}
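
A caveat for the column-wise reader above: in optional (nullable) columns, null entries still occupy positions in the column chunk, and calling getInteger()/getBinary() at such a position does not return a usable value. A null-aware variant of the inner value loop is sketched below; it is an assumption-level adjustment rather than the original code, and colReader, colDescriptor, name, totalValuesInColumnChunk, and columnList refer to the variables in readParquetFileWithColumn above. It compares the current definition level against the column's maximum before reading:

for (int i = 0; i < totalValuesInColumnChunk; i++) {
    String val;
    //  A definition level below the column's maximum means the value is null at this position
    if (colReader.getCurrentDefinitionLevel() < colDescriptor.getMaxDefinitionLevel()) {
        val = null;
    } else if (name.equals("INT32")) {
        val = String.valueOf(colReader.getInteger());
    } else if (name.equals("INT64")) {
        val = String.valueOf(colReader.getLong());
    } else if (name.equals("FLOAT")) {
        val = String.valueOf(colReader.getFloat());
    } else if (name.equals("DOUBLE")) {
        val = String.valueOf(colReader.getDouble());
    } else if (name.equals("BOOLEAN")) {
        val = String.valueOf(colReader.getBoolean());
    } else {
        val = colReader.getBinary().toStringUsingUTF8();
    }
    columnList.add(val);
    colReader.consume();
}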
