文件:
https://pan.baidu.com/s/1G3cMVqVMgVm1f2p7aVKhXw
提取码: 3wj5
依赖:
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.7.0</version>
</dependency>
代码:
import com.nature.third.utils.StringUtils;
import freemarker.template.SimpleDate;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroup;
import org.apache.parquet.hadoop.ParquetReader;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.Type;

import java.io.IOException;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.List;
import java.util.TimeZone;
public class ParquetUtil {

    /** Millis in one day as a long constant — avoids the int overflow of {@code day*24*60*60*1000}. */
    private static final long MILLIS_PER_DAY = 24L * 60 * 60 * 1000;

    /** Demo entry point: dumps the schema and data of a sample parquet file. */
    public static void main(String[] args) throws IOException, ParseException {
        String path = "hdfs://192.168.123.123:9000/types2_1617865619036.parquet";
        path = "C:\\Users\\1112\\Desktop\\types2_1617865619036.parquet";
        String res = getColumn(path);
    }

    /**
     * Reads the parquet file at {@code inPath}, prints its column names/types and
     * all row values, and returns the column-description string.
     *
     * <p>NOTE(review): assumes the file contains an INT32/DATE column named "date"
     * and an INT96 timestamp column named "time" — confirm against the producer.
     *
     * @param inPath local or HDFS path of the parquet file
     * @return one line per column, {@code 字段名:x\t字段类型:y}; empty string when
     *         the path is empty or the file has no rows
     * @throws IOException when the file cannot be opened or read
     * @throws ParseException kept for interface compatibility (not thrown here)
     */
    public static String getColumn(String inPath) throws IOException, ParseException {
        String res = "";
        if (StringUtils.isNotEmpty(inPath)) {
            GroupReadSupport readSupport = new GroupReadSupport();
            ParquetReader<Group> build = ParquetReader.builder(readSupport, new Path(inPath)).build();
            try {
                // 读取内容, 想要内容遍历 line 即可, 方式类似 io
                Group line = build.read();
                if (line == null) {
                    // empty file: no first row to take the schema from
                    return res;
                }
                List<Type> typeList = line.getType().getFields();
                StringBuilder sbu = new StringBuilder();
                StringBuilder sbuV = new StringBuilder();
                StringBuilder sbuNew = new StringBuilder();
                if (typeList != null && !typeList.isEmpty()) {
                    for (Type type : typeList) {
                        String colName = type.getName();
                        String colType = getJavaType(type);
                        sbuV.append(colName).append("\t");
                        sbu.append("字段名:" + colName).append("\t").append("字段类型:" + colType).append("\n");
                    }
                    sbuNew.append("new_date").append("\t");
                    sbuNew.append("new_time").append("\n");
                    if (sbu.length() > 0) {
                        // drop the trailing newline
                        res = sbu.substring(0, sbu.length() - 1);
                    }
                }
                System.out.println(res);
                int size = typeList == null ? 0 : typeList.size();
                // Both converted values are epoch-based GMT millis (see
                // ParquetTimestampUtils.getTimestampMillis), so the formatters are pinned
                // to GMT; the default zone could shift dates near midnight. They are also
                // created once instead of once per row/column as before.
                SimpleDateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd");
                dateFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
                SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
                timeFormat.setTimeZone(TimeZone.getTimeZone("GMT"));
                while (line != null) {
                    sbuV.append("\n");
                    sbuNew.append("\n");
                    for (int i = 0; i < size; i++) {
                        String value = ((SimpleGroup) line).getValueToString(i, 0);
                        sbuV.append(value).append("\t");
                    }
                    // Per-row conversions, hoisted out of the column loop where they were
                    // needlessly repeated once per column. Both values reset each row, so
                    // a null "time" no longer carries over the previous row's timestamp.
                    String newDate = "";
                    String newTime = "";
                    if (size > 0) {
                        // "date" is stored as days since the Unix epoch
                        int timeDay = ((SimpleGroup) line).getInteger("date", 0);
                        newDate = dateFormat.format(timeDay * MILLIS_PER_DAY);
                        Binary bin = ((SimpleGroup) line).getInt96("time", 0);
                        if (bin != null) {
                            long longTime = ParquetTimestampUtils.getTimestampMillis(bin);
                            newTime = timeFormat.format(longTime);
                        }
                    }
                    sbuNew.append(newDate).append("\t");
                    sbuNew.append(newTime);
                    line = build.read();
                }
                System.out.println(sbuV.toString());
                System.out.println(sbuNew.toString());
            } finally {
                // the reader was previously never closed (resource leak)
                build.close();
            }
        }
        return res;
    }

    /**
     * Maps a parquet column type to its Chinese display-type label.
     * INT32 columns whose {@link OriginalType} is DATE are reported as 日期型.
     *
     * @param type parquet field type (must be a primitive type)
     * @return one of 日期型 / 整型 / 浮点型 / 字符型
     */
    private static String getJavaType(Type type) {
        String stringType = "字符型";
        String dateType = "日期型";
        String intType = "整型";
        String doubleType = "浮点型";
        String schemaType = type.asPrimitiveType().getPrimitiveTypeName().name();
        if ("int32".equalsIgnoreCase(schemaType)) {
            OriginalType originalType = type.getOriginalType();
            if (originalType != null && "date".equalsIgnoreCase(originalType.name())) {
                schemaType = "date";
            }
        }
        switch (schemaType.toLowerCase()) {
            case "date":
            case "int96":        // parquet encodes timestamps as INT96
                return dateType;
            case "int32":
            case "int64":
                return intType;
            case "float":
            case "double":
                return doubleType;
            case "boolean":
            case "binary":
            default:
                return stringType;
        }
    }
}
import java.util.concurrent.TimeUnit;
import org.apache.parquet.io.api.Binary;
import com.google.common.primitives.Ints;
import com.google.common.primitives.Longs;
public class ParquetTimestampUtils {

    /** Julian day number of the Unix epoch (1970-01-01). */
    private static final int JULIAN_EPOCH_OFFSET_DAYS = 2440588;
    private static final long MILLIS_IN_DAY = TimeUnit.DAYS.toMillis(1);
    private static final long NANOS_PER_MILLISECOND = TimeUnit.MILLISECONDS.toNanos(1);

    /** Static utility holder — not instantiable. */
    private ParquetTimestampUtils() {}

    /**
     * Decodes a parquet INT96 timestamp into epoch millis, GMT.
     *
     * <p>The 12-byte layout is: 8 bytes of little-endian nanos-of-day followed by
     * 4 bytes of little-endian Julian day number.
     *
     * @param timestampBinary INT96 parquet timestamp value
     * @return millis since the Unix epoch (GMT), or 0 when the input is not
     *         exactly 12 bytes (malformed input is tolerated, not thrown)
     */
    public static long getTimestampMillis(Binary timestampBinary) {
        if (timestampBinary.length() != 12) {
            return 0;
        }
        byte[] raw = timestampBinary.getBytes();
        // on-disk order is little endian, so feed the bytes in reversed order
        long nanosOfDay = Longs.fromBytes(raw[7], raw[6], raw[5], raw[4], raw[3], raw[2], raw[1], raw[0]);
        int julianDay = Ints.fromBytes(raw[11], raw[10], raw[9], raw[8]);
        return julianDayToMillis(julianDay) + nanosOfDay / NANOS_PER_MILLISECOND;
    }

    /** Converts a Julian day number to millis since the Unix epoch. */
    private static long julianDayToMillis(int julianDay) {
        return (julianDay - JULIAN_EPOCH_OFFSET_DAYS) * MILLIS_IN_DAY;
    }
}
源文件数据:
输出结果
new_date 和 new_time 属于二次转换的结果