Defining the Parquet File Format and Writing Data


Preface

Parquet stores data in a columnar layout. Its data model supports List, Map, Struct, and Decimal, as well as the Hive primitive types (PrimitiveTypeInfo).

pom.xml dependencies

<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-core</artifactId>
    <version>3.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hadoop</groupId>
    <artifactId>hadoop-mapreduce-client-jobclient</artifactId>
    <version>3.2.0</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-jdbc</artifactId>
    <version>3.1.2</version>
</dependency>
<dependency>
    <groupId>org.apache.hive</groupId>
    <artifactId>hive-exec</artifactId>
    <version>3.1.2</version>
</dependency>
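
The org.apache.parquet classes used below are resolved transitively through hive-exec, which bundles Parquet. If your build cannot resolve them, declaring parquet-hadoop explicitly should work; the version here is an assumption based on the Parquet release bundled with Hive 3.1.2.

<!-- Optional: only if the Parquet classes are not already on the classpath -->
<dependency>
    <groupId>org.apache.parquet</groupId>
    <artifactId>parquet-hadoop</artifactId>
    <version>1.10.0</version>
</dependency>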

I. Defining the Parquet Schema

Parquet data types fall into two categories: primitive types and Group types.

1. Primitive data types

void, boolean, int, bigint, string, char, varchar, float, double, tinyint, smallint, date, timestamp, interval_year_month, interval_day_time, binary, decimal.

2. Group data types

List, Map, Struct

3. Schema definition code

The detailed implementation and the full type mapping are in the private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) method.

package com.study.spark.mr.utils;

import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.parquet.schema.*;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;

import java.util.List;

/**
 * Defines the Parquet schema (MessageType).
 * GroupType is a subclass of Type.
 */
public class ParquetDataSchema {

    public static MessageType convert(final List<String> columnNames, final List<TypeInfo> columnTypes) {
        final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes));
        return schema;
    }

    private static Type[] convertTypes(final List<String> columnNames, final List<TypeInfo> columnTypes) {
        if (columnNames.size() != columnTypes.size()) {
            throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" +
                    " found : " + columnNames + " . And Hive types found : " + columnTypes);
        }
        final Type[] types = new Type[columnNames.size()];
        for (int i = 0; i < columnNames.size(); ++i) {
            types[i] = convertType(columnNames.get(i), columnTypes.get(i));
        }
        return types;
    }

    private static Type convertType(final String name, final TypeInfo typeInfo) {
        return convertType(name, typeInfo, Repetition.OPTIONAL);
    }

    private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) {
        // Primitive (non-nested) types
        if (typeInfo.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
            if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8)
                        .named(name);
            } else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT32, repetition)
                        .as(OriginalType.INT_16).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT32, repetition)
                        .as(OriginalType.INT_8).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) {
                throw new UnsupportedOperationException("Void type not implemented");
            } else if (typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.CHAR_TYPE_NAME)
                    || typeInfo.getTypeName().toLowerCase().startsWith(serdeConstants.VARCHAR_TYPE_NAME)) {
                // char and varchar are both written as UTF8-annotated binary
                return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8)
                        .named(name);
            } else if (typeInfo instanceof DecimalTypeInfo) {
                DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
                int prec = decimalTypeInfo.precision();
                int scale = decimalTypeInfo.scale();
                // fixed byte width derived from the precision
                int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1];
                return Types.primitive(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY, repetition).length(bytes)
                        .as(OriginalType.DECIMAL).scale(scale).precision(prec).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) {
                return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named(name);
            } else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) {
                throw new UnsupportedOperationException("Unknown type not implemented");
            } else {
                throw new IllegalArgumentException("Unknown type: " + typeInfo);
            }
        } else if (typeInfo.getCategory().equals(ObjectInspector.Category.LIST)) {
            return convertArrayType(name, (ListTypeInfo) typeInfo);
        } else if (typeInfo.getCategory().equals(ObjectInspector.Category.STRUCT)) {
            return convertStructType(name, (StructTypeInfo) typeInfo);
        } else if (typeInfo.getCategory().equals(ObjectInspector.Category.MAP)) {
            return convertMapType(name, (MapTypeInfo) typeInfo);
        } else if (typeInfo.getCategory().equals(ObjectInspector.Category.UNION)) {
            throw new UnsupportedOperationException("Union type not implemented");
        } else {
            throw new IllegalArgumentException("Unknown type: " + typeInfo);
        }
    }

    // An optional group containing a repeated anonymous group "bag", containing
    // 1 anonymous element "array_element"
    @SuppressWarnings("deprecation")
    private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) {
        final TypeInfo subType = typeInfo.getListElementTypeInfo();
        return new GroupType(Repetition.OPTIONAL, name, OriginalType.LIST, new GroupType(Repetition.REPEATED,
                ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType)));
    }

    // An optional group containing multiple elements
    private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo) {
        final List<String> columnNames = typeInfo.getAllStructFieldNames();
        final List<TypeInfo> columnTypes = typeInfo.getAllStructFieldTypeInfos();
        return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes));
    }

    // An optional group containing a repeated anonymous group "map", containing
    // 2 elements: "key", "value"
    private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) {
        final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(),
                typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED);
        final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(),
                typeInfo.getMapValueTypeInfo());
        return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType, valueType);
    }
}

4. Test example

Sample code

package com.study.spark.mr.utils;

import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;

import java.util.ArrayList;
import java.util.List;

public class ParquetSchemaTest {

    public static void main(String[] args){
        ParquetSchemaTest test = new ParquetSchemaTest();

        test.messageType();
    }
    
    public void messageType() {
        MessageType messageType = parquetSchema();
        System.out.println("schema string = " + messageType.toString());
        println(messageType.getFields());
    }

    public void println(List<Type> types) {
        for (Type type : types) {
            System.out.println("type name = " + type.getName());
            System.out.println("repetition type = " + type.getRepetition().name());
            if (type.getOriginalType() != null) {
                System.out.println("original type = " + type.getOriginalType().name());
            }
            // Primitive type, or a nested group?
            if (type.isPrimitive()) {
                System.out.println("primitive type name = " + type.asPrimitiveType().getName());
            } else {
                println(type.asGroupType().getFields());
            }
        }
    }

    public MessageType parquetSchema(){
        List<String> columnNames = new ArrayList<>();
        List<TypeInfo> columnTypes = new ArrayList<>();
        columnNames.add("name");
        columnTypes.add(TypeInfoFactory.stringTypeInfo);

        columnNames.add("age");
        columnTypes.add(TypeInfoFactory.intTypeInfo);

        columnNames.add("toatal");
        columnTypes.add(new DecimalTypeInfo(22,2));

        StructTypeInfo structTypeInfo = new StructTypeInfo();
        structTypeInfo.setAllStructFieldNames(new ArrayList<>(columnNames));
        structTypeInfo.setAllStructFieldTypeInfos(new ArrayList<>(columnTypes));
        columnNames.add("struct_test");
        columnTypes.add(structTypeInfo);

        MapTypeInfo mapTypeInfo = new MapTypeInfo();
        mapTypeInfo.setMapKeyTypeInfo(TypeInfoFactory.stringTypeInfo);
        mapTypeInfo.setMapValueTypeInfo(TypeInfoFactory.floatTypeInfo);
        columnNames.add("map_test");
        columnTypes.add(mapTypeInfo);

        ListTypeInfo listTypeInfo = new ListTypeInfo();
        listTypeInfo.setListElementTypeInfo(TypeInfoFactory.stringTypeInfo);
        columnNames.add("list_test");
        columnTypes.add(listTypeInfo);

        MessageType messageType = ParquetDataSchema.convert(columnNames, columnTypes);
        return messageType;
    }
}

Test output

schema string = message hive_schema {
  optional binary name (UTF8);
  optional int32 age;
  optional fixed_len_byte_array(10) total (DECIMAL(22,2));
  optional group struct_test {
    optional binary name (UTF8);
    optional int32 age;
    optional fixed_len_byte_array(10) total (DECIMAL(22,2));
  }
  optional group map_test (MAP) {
    repeated group map (MAP_KEY_VALUE) {
      required binary key (UTF8);
      optional float value;
    }
  }
  optional group list_test (LIST) {
    repeated group bag {
      optional binary array_element (UTF8);
    }
  }
}

type name = name
repetition type = OPTIONAL
original type = UTF8
primitive type name = name
type name = age
repetition type = OPTIONAL
primitive type name = age
type name = total
repetition type = OPTIONAL
original type = DECIMAL
primitive type name = total
type name = struct_test
repetition type = OPTIONAL
type name = name
repetition type = OPTIONAL
original type = UTF8
primitive type name = name
type name = age
repetition type = OPTIONAL
primitive type name = age
type name = total
repetition type = OPTIONAL
original type = DECIMAL
primitive type name = total
type name = map_test
repetition type = OPTIONAL
original type = MAP
type name = map
repetition type = REPEATED
original type = MAP_KEY_VALUE
type name = key
repetition type = REQUIRED
original type = UTF8
primitive type name = key
type name = value
repetition type = OPTIONAL
primitive type name = value
type name = list_test
repetition type = OPTIONAL
original type = LIST
type name = bag
repetition type = REPEATED
type name = array_element
repetition type = OPTIONAL
original type = UTF8
primitive type name = array_element
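
Note how the generated schema follows Hive's legacy Parquet layout: a LIST is a three-level structure (an optional group annotated LIST, a repeated inner group named bag, and the element field array_element), and a MAP is an optional group annotated MAP whose repeated inner group holds a required key and an optional value. The writer code in the next section reproduces exactly this nesting.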

II. Converting Data to the Parquet Output Format

1. Converting values for writing

package com.study.spark.mr.utils;

import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.io.api.Binary;

import java.sql.Date;
import java.sql.Timestamp;
import java.util.Iterator;
import java.util.List;
import java.util.Map;

public class ParquetDataWrite {

    public static Boolean booleanDataWriter(Boolean val) {
        return val;
    }

    public static Integer byteDataWriter(byte val) {
        return Integer.valueOf(val);
    }

    public static Integer shortDataWriter(Short val) {
        return Integer.valueOf(val);
    }

    public static Integer intWriter(Integer val) {
        return val;
    }

    public static Long longWriter(Long val) {
        return val;
    }

    public static Float floatWriter(Float val) {
        return val;
    }

    public static Double doubleDataWriter(Double val) {
        return val;
    }

    public static Binary stringWriter(String val) {
        return Binary.fromString(val);
    }

    public static Binary varcharWriter(String val) {
        return Binary.fromString(val);
    }

    /**
     * Convert a byte[] to Binary for writing.
     */
    public static Binary binaryWrite(byte[] bytes) {
        return Binary.fromByteArray(bytes);
    }

    /**
     * Convert a Timestamp to Binary (INT96 nano time) for writing.
     */
    public static Binary timestampWrite(Timestamp ts) {
        return NanoTimeUtils.getNanoTime(ts, false).toBinary();
    }

    /**
     * Convert a decimal value, given as a string, to Binary for writing.
     *
     * @param val   the decimal value as a string
     * @param prec  the precision declared for the Decimal column
     * @param scale the number of digits after the decimal point
     */
    public static Binary decimalWrite(String val, int prec, int scale) {
        HiveDecimal hiveDecimal = HiveDecimal.create(val);
        byte[] decimalBytes = hiveDecimal.bigIntegerBytesScaled(scale);

        // Estimated number of bytes needed.
        int precToBytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1];
        if (precToBytes == decimalBytes.length) {
            // No padding needed.
            return Binary.fromByteArray(decimalBytes);
        }

        byte[] tgt = new byte[precToBytes];
        if (hiveDecimal.signum() == -1) {
            // For negative numbers, initialize all bits to 1 (sign extension)
            for (int i = 0; i < precToBytes; i++) {
                tgt[i] |= 0xFF;
            }
        }

        System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones.
        return Binary.fromByteArray(tgt);
    }

    /**
     * Convert a Date to days since epoch (written as INT32).
     */
    public static Integer dateWrite(Date date) {
        return Integer.valueOf(DateWritable.dateToDays(date));
    }

    /**
     * Write a List value as a Group.
     * @param group  the enclosing group (row)
     * @param index  the field's position in the enclosing group; a field name can be passed instead
     * @param values the list elements; String is only an example, use the element type declared for the List
     * @return the enclosing group
     */
    public static Group listWrite(Group group, int index, List<String> values) {
        Group listGroup = group.addGroup(index);
        for (String v : values) {
            // Each element goes into its own repeated "bag" group
            Group bagGroup = listGroup.addGroup(0);
            bagGroup.add(0, v);
        }
        return group;
    }

    /**
     * Write a Map value as a Group.
     * @param group  the enclosing group (row)
     * @param index  the field's position in the enclosing group; a field name can be passed instead
     * @param values the entries; String key/value types are only an example, use the types declared for the Map
     * @return the enclosing group
     */
    public static Group mapWrite(Group group, int index, Map<String, String> values) {
        Group mapGroup = group.addGroup(index);
        for (Map.Entry<String, String> entry : values.entrySet()) {
            // Each entry goes into its own repeated key/value group
            Group dataGroup = mapGroup.addGroup(0);
            dataGroup.add("key", entry.getKey());
            dataGroup.add("value", entry.getValue());
        }
        return group;
    }

    /**
     * Write a Struct value as a Group.
     * @param group  the enclosing group (row)
     * @param index  the field's position in the enclosing group; a field name can be passed instead
     * @param values example values; use the field types declared for the Struct
     * @return the enclosing group
     */
    public static Group structWrite(Group group, int index, String[] values) {
        Group structGroup = group.addGroup(index);
        for (int i = 0; i < values.length; i++) {
            structGroup.add(i, values[i]);
        }
        return group;
    }

}
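
2. Writing a file end to end

The writers above only convert individual values. To produce an actual file, they have to be combined with a ParquetWriter. The following is a minimal sketch, assuming the parquet-mr example API (ExampleParquetWriter, SimpleGroupFactory), which is included in the Parquet bundle that ships with Hive 3.x; the output path and the field values are made up for illustration.

package com.study.spark.mr.utils;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;

import java.util.ArrayList;
import java.util.List;

public class ParquetWriteTest {

    public static void main(String[] args) throws Exception {
        // Build the schema with the converter from section I
        List<String> columnNames = new ArrayList<>();
        List<TypeInfo> columnTypes = new ArrayList<>();
        columnNames.add("name");
        columnTypes.add(TypeInfoFactory.stringTypeInfo);
        columnNames.add("age");
        columnTypes.add(TypeInfoFactory.intTypeInfo);
        columnNames.add("total");
        columnTypes.add(new DecimalTypeInfo(22, 2));
        MessageType schema = ParquetDataSchema.convert(columnNames, columnTypes);

        // ExampleParquetWriter wires GroupWriteSupport to the schema;
        // the output path here is only an example
        try (ParquetWriter<Group> writer = ExampleParquetWriter
                .builder(new Path("/tmp/parquet_write_test.parquet"))
                .withType(schema)
                .build()) {
            SimpleGroupFactory factory = new SimpleGroupFactory(schema);
            Group row = factory.newGroup();
            // Every value must match the type declared in the schema
            row.add("name", ParquetDataWrite.stringWriter("alice"));
            row.add("age", ParquetDataWrite.intWriter(30));
            // decimal(22,2): "1234.56" becomes the unscaled value 123456,
            // sign-extended into the 10-byte fixed_len_byte_array
            row.add("total", ParquetDataWrite.decimalWrite("1234.56", 22, 2));
            writer.write(row);
        }
    }
}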


Summary

  1. In the Group writers above, the parameters passed in are only examples; the values you write must match the data types declared in the schema, as the sketch below illustrates.
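
A minimal illustration of this point (hypothetical; factory is a SimpleGroupFactory built from the test schema in section I, where age is int32 and name is UTF8 binary):

Group row = factory.newGroup();
row.add("age", 30);                        // OK: matches the declared int32
row.add("name", Binary.fromString("bob")); // OK: matches the declared binary (UTF8)
// row.add("age", "30");  // mismatch: stored as binary, so writing the int32 column fails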