Reading Parquet files with ParquetFileReader: only a mocked Hadoop environment is needed, not a real installation.

1. Column entity class

package com.kestrel;
/**
 * @Author: 12640
 * @Date: 2021/1/1 15:13
 * @Description: Describes one column of the parsed file: name, type and position.
 */
public class TableHead {
    /**
     * Column name
     */
    private String name;
    /**
     * Data type of the column
     */
    private String type;
    /**
     * Column index
     */
    private Integer index;

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public Integer getIndex() {
        return index;
    }

    public void setIndex(Integer index) {
        this.index = index;
    }
}
2. Parquet result entity class

package com.kestrel;
import java.util.List;
/**
 * @Author: 12640
 * @Date: 2021/1/1 15:14
 * @Description: Parsed result: header columns plus row-major data.
 */
public class TableResult {
    /**
     * Header information of the parsed file; for now only meaningful for arrow/csv-style files
     */
    private List<TableHead> columns;
    /**
     * Row data; each inner list is one row
     */
    private List<List<Object>> data;

    public List<TableHead> getColumns() {
        return columns;
    }

    public void setColumns(List<TableHead> columns) {
        this.columns = columns;
    }

    public List<List<Object>> getData() {
        return data;
    }

    public void setData(List<List<Object>> data) {
        this.data = data;
    }
}
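For orientation, a minimal sketch of what these two classes hold, filled in by hand (the column name and values here are made-up examples, not produced by the reader below):

TableHead col = new TableHead();
col.setName("id");     // hypothetical column name
col.setType("INT64");  // physical type name, as derived in parquetColumn() below
col.setIndex(0);

TableResult result = new TableResult();
result.setColumns(java.util.Collections.singletonList(col));
result.setData(java.util.Collections.singletonList(
        java.util.Collections.singletonList((Object) 42L)));  // one row, one cell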
3. Reading the Parquet file

import com.fasterxml.jackson.databind.ObjectMapper;
import com.google.common.collect.Lists;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.column.page.PageReadStore;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.convert.GroupRecordConverter;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.io.ColumnIOFactory;
import org.apache.parquet.io.MessageColumnIO;
import org.apache.parquet.io.RecordReader;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class ReadParquet {

    public static void main(String[] args) throws Exception {
        TableResult tableResult = parquetReaderV2(new File("./tmp/demo.snappy.parquet"));
        ObjectMapper mapper = new ObjectMapper();
        String jsonString = mapper.writerWithDefaultPrettyPrinter()
                .writeValueAsString(tableResult);
        System.out.println(jsonString);
    }
    public static TableResult parquetReaderV2(File file) throws Exception {
        long start = System.currentTimeMillis();
        hadoopEnv();
        Path path = new Path(file.getAbsolutePath());
        Configuration conf = new Configuration();
        TableResult table = new TableResult();
        // two-dimensional data list (rows of cells)
        List<List<Object>> dataList = Lists.newArrayList();
        ParquetMetadata readFooter = ParquetFileReader.readFooter(conf, path, ParquetMetadataConverter.NO_FILTER);
        MessageType schema = readFooter.getFileMetaData().getSchema();
        ParquetFileReader r = new ParquetFileReader(conf, readFooter.getFileMetaData(), path, readFooter.getBlocks(), schema.getColumns());
        // with parquet 1.9.0, create the reader like this instead:
        // ParquetFileReader r = new ParquetFileReader(conf, path, readFooter);
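        // Untested alternative (an assumption, check your parquet-hadoop version):
        // newer releases expose an InputFile-based factory that avoids the
        // deprecated constructors above:
        // ParquetFileReader r = ParquetFileReader.open(
        //         org.apache.parquet.hadoop.util.HadoopInputFile.fromPath(path, conf));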
        PageReadStore pages = null;
        try {
            while (null != (pages = r.readNextRowGroup())) {
                final long rows = pages.getRowCount();
                // logger.info(file.getName() + " row count: " + rows);
                final MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema);
                final RecordReader<Group> recordReader = columnIO.getRecordReader(pages,
                        new GroupRecordConverter(schema));
                for (long i = 0; i < rows; i++) {
                    final Group g = recordReader.read();
                    if (i == 0) {
                        // derive the header column names from the first record
                        table.setColumns(parquetColumn(g));
                    }
                    // collect the row data; the first record doubles as header and data
                    List<Object> row = getparquetData(table.getColumns(), g);
                    dataList.add(row);
                    // printGroup(g);
                }
            }
        } finally {
            r.close();
        }
        // logger.info(file.getName() + " load time: " + (System.currentTimeMillis() - start));
        table.setData(dataList);
        return table;
    }
    // In recent versions all ParquetReader constructors are deprecated;
    // construct the reader through the builder instead:
    static void parquetReader(String inPath) throws Exception {
        GroupReadSupport readSupport = new GroupReadSupport();
        try (ParquetReader<Group> reader = ParquetReader.builder(readSupport, new Path(inPath)).build()) {
            Group line;
            while ((line = reader.read()) != null) {
                System.out.println(line.toString());
            }
        }
        System.out.println("finished reading");
    }
    private static List<Object> getparquetData(List<TableHead> columns, Group line) {
        List<Object> row = new ArrayList<>();
        for (int i = 0; i < columns.size(); i++) {
            // reset per cell so a failed read yields null instead of the previous value
            Object cellStr = null;
            try {
                switch (columns.get(i).getType()) {
                    case "DOUBLE":
                        cellStr = line.getDouble(i, 0);
                        break;
                    case "FLOAT":
                        cellStr = line.getFloat(i, 0);
                        break;
                    case "BOOLEAN":
                        cellStr = line.getBoolean(i, 0);
                        break;
                    case "INT96":
                        // INT96 is returned as a Binary (commonly a legacy timestamp encoding)
                        cellStr = line.getInt96(i, 0);
                        break;
                    case "INT64":
                        cellStr = line.getLong(i, 0);
                        break;
                    default:
                        cellStr = line.getValueToString(i, 0);
                }
            } catch (RuntimeException e) {
                // a missing/null field throws; record null for this cell
            } finally {
                row.add(cellStr);
            }
        }
        return row;
    }
    /**
     * Extracts the table header (column) information from a parquet record
     *
     * @param line one record of the parquet file
     * @return the list of column descriptors
     */
    private static List<TableHead> parquetColumn(Group line) {
        List<TableHead> columns = Lists.newArrayList();
        TableHead dto = null;
        GroupType groupType = line.getType();
        int fieldCount = groupType.getFieldCount();
        for (int i = 0; i < fieldCount; i++) {
            dto = new TableHead();
            Type type = groupType.getType(i);
            String fieldName = type.getName();
            OriginalType originalType = type.getOriginalType();
            String typeName = null;
            if (originalType != null) {
                // logical type annotation, e.g. UTF8 or DECIMAL
                typeName = originalType.name();
            } else {
                // fall back to the physical primitive type, e.g. INT64 or DOUBLE
                typeName = type.asPrimitiveType().getPrimitiveTypeName().name();
            }
            dto.setIndex(i);
            dto.setName(fieldName);
            dto.setType(typeName);
            columns.add(dto);
        }
        return columns;
    }
    public static void hadoopEnv() throws IOException {
        // Mock a local HADOOP_HOME so the Hadoop client libraries start on Windows:
        // point hadoop.home.dir at the working directory and create an empty
        // bin/winutils.exe, which is enough for reading local files.
        File workaround = new File(".");
        System.getProperties().put("hadoop.home.dir", workaround.getAbsolutePath());
        new File("./bin").mkdirs();
        new File("./bin/winutils.exe").createNewFile();
    }
}
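To compile this example you need, per the imports above, parquet-hadoop (org.apache.parquet:parquet-hadoop), the Hadoop client libraries (org.apache.hadoop:hadoop-common or hadoop-client), Jackson (com.fasterxml.jackson.core:jackson-databind) and Guava (com.google.guava:guava) on the classpath. Pick versions that match your environment, since the ParquetFileReader constructors used here vary across parquet-hadoop releases.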