Preface
Parquet stores data in a columnar format. The supported data types are the primitive types (PrimitiveTypeInfo) plus List, Map, Struct, and Decimal.
Add the following dependencies to pom.xml:
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-core</artifactId>
<version>3.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-mapreduce-client-jobclient</artifactId>
<version>3.2.0</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-exec</artifactId>
<version>3.1.2</version>
</dependency>
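The Parquet classes used below (org.apache.parquet.schema.*, org.apache.parquet.example.data.*) are pulled in transitively through the hive-exec dependency. If you prefer to declare them explicitly, a parquet-hadoop dependency can also be added; 1.10.0 is the version Hive 3.1.2 builds against, but treat this extra dependency as an optional suggestion rather than part of the original setup.
<dependency>
<groupId>org.apache.parquet</groupId>
<artifactId>parquet-hadoop</artifactId>
<version>1.10.0</version>
</dependency>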
I. Defining the Parquet Data Schema
Parquet data types fall into two groups: primitive types and group types.
1. Primitive types
void, boolean, int, bigint, string, char, varchar, float, double, tinyint, smallint, date, timestamp, interval_year_month, interval_day_time, binary, decimal.
2. Group types
List, Map, Struct
3. Code to define the Parquet schema
The mapping for each data type is implemented in the private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) method below.
package com.study.spark.mr.utils;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.serde.serdeConstants;
import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.parquet.schema.*;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Type.Repetition;
import java.util.ArrayList;
import java.util.List;
/**
* Defines the Parquet schema
* GroupType is a subclass of Type
*/
public class ParquetDataSchema {
public static MessageType convert(final List<String> columnNames, final List<TypeInfo> columnTypes) {
final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes));
return schema;
}
private static Type[] convertTypes(final List<String> columnNames, final List<TypeInfo> columnTypes) {
if (columnNames.size() != columnTypes.size()) {
throw new IllegalStateException("Mismatched Hive columns and types. Hive columns names" +
" found : " + columnNames + " . And Hive types found : " + columnTypes);
}
final Type[] types = new Type[columnNames.size()];
for (int i = 0; i < columnNames.size(); ++i) {
types[i] = convertType(columnNames.get(i), columnTypes.get(i));
}
return types;
}
private static Type convertType(final String name, final TypeInfo typeInfo) {
return convertType(name, typeInfo, Repetition.OPTIONAL);
}
private static Type convertType(final String name, final TypeInfo typeInfo, final Repetition repetition) {
// Is this a primitive type?
if (typeInfo.getCategory().equals(ObjectInspector.Category.PRIMITIVE)) {
if (typeInfo.equals(TypeInfoFactory.stringTypeInfo)) {
return Types.primitive(PrimitiveTypeName.BINARY, repetition).as(OriginalType.UTF8)
.named(name);
} else if (typeInfo.equals(TypeInfoFactory.intTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT32, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.shortTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT32, repetition)
.as(OriginalType.INT_16).named(name);
} else if (typeInfo.equals(TypeInfoFactory.byteTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT32, repetition)
.as(OriginalType.INT_8).named(name);
} else if (typeInfo.equals(TypeInfoFactory.longTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT64, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.doubleTypeInfo)) {
return Types.primitive(PrimitiveTypeName.DOUBLE, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.floatTypeInfo)) {
return Types.primitive(PrimitiveTypeName.FLOAT, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.booleanTypeInfo)) {
return Types.primitive(PrimitiveTypeName.BOOLEAN, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.binaryTypeInfo)) {
return Types.primitive(PrimitiveTypeName.BINARY, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.timestampTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT96, repetition).named(name);
} else if (typeInfo.equals(TypeInfoFactory.voidTypeInfo)) {
throw new UnsupportedOperationException("Void type not implemented");
} else if (typeInfo.getTypeName().toLowerCase().startsWith(
serdeConstants.CHAR_TYPE_NAME)) {
return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
.named(name);
} else if (typeInfo.getTypeName().toLowerCase().startsWith(
serdeConstants.VARCHAR_TYPE_NAME)) {
return Types.optional(PrimitiveTypeName.BINARY).as(OriginalType.UTF8)
.named(name);
} else if (typeInfo instanceof DecimalTypeInfo) {
DecimalTypeInfo decimalTypeInfo = (DecimalTypeInfo) typeInfo;
int prec = decimalTypeInfo.precision();
int scale = decimalTypeInfo.scale();
int bytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1];
return Types.optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(bytes).as(OriginalType.DECIMAL).
scale(scale).precision(prec).named(name);
} else if (typeInfo.equals(TypeInfoFactory.dateTypeInfo)) {
return Types.primitive(PrimitiveTypeName.INT32, repetition).as(OriginalType.DATE).named
(name);
} else if (typeInfo.equals(TypeInfoFactory.unknownTypeInfo)) {
throw new UnsupportedOperationException("Unknown type not implemented");
} else {
throw new IllegalArgumentException("Unknown type: " + typeInfo);
}
} else if (typeInfo.getCategory().equals(ObjectInspector.Category.LIST)) {
return convertArrayType(name, (ListTypeInfo) typeInfo);
} else if (typeInfo.getCategory().equals(ObjectInspector.Category.STRUCT)) {
return convertStructType(name, (StructTypeInfo) typeInfo);
} else if (typeInfo.getCategory().equals(ObjectInspector.Category.MAP)) {
return convertMapType(name, (MapTypeInfo) typeInfo);
} else if (typeInfo.getCategory().equals(ObjectInspector.Category.UNION)) {
throw new UnsupportedOperationException("Union type not implemented");
} else {
throw new IllegalArgumentException("Unknown type: " + typeInfo);
}
}
// An optional group containing a repeated anonymous group "bag", containing
// 1 anonymous element "array_element"
@SuppressWarnings("deprecation")
private static GroupType convertArrayType(final String name, final ListTypeInfo typeInfo) {
final TypeInfo subType = typeInfo.getListElementTypeInfo();
return new GroupType(Repetition.OPTIONAL, name, OriginalType.LIST, new GroupType(Repetition.REPEATED,
ParquetHiveSerDe.ARRAY.toString(), convertType("array_element", subType)));
}
// An optional group containing multiple elements
private static GroupType convertStructType(final String name, final StructTypeInfo typeInfo) {
final List<String> columnNames = typeInfo.getAllStructFieldNames();
final List<TypeInfo> columnTypes = typeInfo.getAllStructFieldTypeInfos();
return new GroupType(Repetition.OPTIONAL, name, convertTypes(columnNames, columnTypes));
}
// An optional group containing a repeated anonymous group "map", containing
// 2 elements: "key", "value"
private static GroupType convertMapType(final String name, final MapTypeInfo typeInfo) {
final Type keyType = convertType(ParquetHiveSerDe.MAP_KEY.toString(),
typeInfo.getMapKeyTypeInfo(), Repetition.REQUIRED);
final Type valueType = convertType(ParquetHiveSerDe.MAP_VALUE.toString(),
typeInfo.getMapValueTypeInfo());
return ConversionPatterns.mapType(Repetition.OPTIONAL, name, keyType, valueType);
}
}
4. Test example
Sample code:
package com.study.spark.mr.utils;
import org.apache.hadoop.hive.serde2.typeinfo.*;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.Type;
import java.util.ArrayList;
import java.util.List;
public class ParquetSchemaTest {
public static void main(String[] args){
ParquetSchemaTest test = new ParquetSchemaTest();
test.messageType();
}
public void messageType(){
MessageType messageType = parquetSchema();
System.out.println("schema string = "+messageType.toString());
println(messageType.getFields());
}
public void println(List<Type> types){
for (Type type : types){
System.out.println("type name = "+ type.getName());
System.out.println("repetition type = "+ type.getRepetition().name());
if(type.getOriginalType() != null){
System.out.println("original type = " + type.getOriginalType().name());
}
// Is this a primitive type?
boolean primitive = type.isPrimitive();
if(primitive){
System.out.println("primitive type name = "+type.asPrimitiveType().getName());
}else {
GroupType groupType = type.asGroupType();
println(groupType.getFields());
}
}
}
public MessageType parquetSchema(){
List<String> columnNames = new ArrayList<>();
List<TypeInfo> columnTypes = new ArrayList<>();
columnNames.add("name");
columnTypes.add(TypeInfoFactory.stringTypeInfo);
columnNames.add("age");
columnTypes.add(TypeInfoFactory.intTypeInfo);
columnNames.add("toatal");
columnTypes.add(new DecimalTypeInfo(22,2));
StructTypeInfo structTypeInfo = new StructTypeInfo();
structTypeInfo.setAllStructFieldNames(new ArrayList<>(columnNames));
structTypeInfo.setAllStructFieldTypeInfos(new ArrayList<>(columnTypes));
columnNames.add("struct_test");
columnTypes.add(structTypeInfo);
MapTypeInfo mapTypeInfo = new MapTypeInfo();
mapTypeInfo.setMapKeyTypeInfo(TypeInfoFactory.stringTypeInfo);
mapTypeInfo.setMapValueTypeInfo(TypeInfoFactory.floatTypeInfo);
columnNames.add("map_test");
columnTypes.add(mapTypeInfo);
ListTypeInfo listTypeInfo = new ListTypeInfo();
listTypeInfo.setListElementTypeInfo(TypeInfoFactory.stringTypeInfo);
columnNames.add("list_test");
columnTypes.add(listTypeInfo);
MessageType messageType = ParquetDataSchema.convert(columnNames,columnTypes);
return messageType;
}
}
Test output:
schema string = message hive_schema {
optional binary name (UTF8);
optional int32 age;
optional fixed_len_byte_array(10) total (DECIMAL(22,2));
optional group struct_test {
optional binary name (UTF8);
optional int32 age;
optional fixed_len_byte_array(10) total (DECIMAL(22,2));
}
optional group map_test (MAP) {
repeated group map (MAP_KEY_VALUE) {
required binary key (UTF8);
optional float value;
}
}
optional group list_test (LIST) {
repeated group bag {
optional binary array_element (UTF8);
}
}
}
type name = name
repetition type = OPTIONAL
original type = UTF8
primitive type name = name
type name = age
repetition type = OPTIONAL
primitive type name = age
type name = total
repetition type = OPTIONAL
original type = DECIMAL
primitive type name = total
type name = struct_test
repetition type = OPTIONAL
type name = name
repetition type = OPTIONAL
original type = UTF8
primitive type name = name
type name = age
repetition type = OPTIONAL
primitive type name = age
type name = total
repetition type = OPTIONAL
original type = DECIMAL
primitive type name = total
type name = map_test
repetition type = OPTIONAL
original type = MAP
type name = map
repetition type = REPEATED
original type = MAP_KEY_VALUE
type name = key
repetition type = REQUIRED
original type = UTF8
primitive type name = key
type name = value
repetition type = OPTIONAL
primitive type name = value
type name = list_test
repetition type = OPTIONAL
original type = LIST
type name = bag
repetition type = REPEATED
type name = array_element
repetition type = OPTIONAL
original type = UTF8
primitive type name = array_element
II. Converting Data to the Parquet Write Format
1. Converting field values for writing
package com.study.spark.mr.utils;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe;
import org.apache.hadoop.hive.ql.io.parquet.timestamp.NanoTimeUtils;
import org.apache.hadoop.hive.serde2.io.DateWritable;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.io.api.Binary;
import org.apache.parquet.schema.MessageType;
import java.sql.Date;
import java.sql.Timestamp;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
public class ParquetDataWrite {
public static Boolean booleanDataWriter(Boolean val) {
return val;
}
public static Integer byteDataWriter(byte val) {
return Integer.valueOf(val);
}
public static Integer shortDataWriter(Short val) {
return Integer.valueOf(val);
}
public static Integer intWriter(Integer val) {
return val;
}
public static Long longWriter(Long val) {
return val;
}
public static Float floatWriter(Float val) {
return val;
}
public static Double doubleDataWriter(Double val) {
return val;
}
public static Binary stringWriter(String val) {
return Binary.fromString(val);
}
public static Binary varcharWriter(String val) {
return Binary.fromString(val);
}
/**
* Converts a byte[] value to Binary for writing
*/
public static Binary binaryWrite(byte[] bytes) {
return Binary.fromByteArray(bytes);
}
/**
* Converts a Timestamp to Binary (INT96 nano time) for writing
*/
public static Binary timestampWrite(Timestamp ts) {
return NanoTimeUtils.getNanoTime(ts, false).toBinary();
}
/**
* Converts a decimal value, given as a string, to Binary for writing
*
* @param val the decimal value
* @param prec the precision defined for the Decimal column
* @param scale the number of digits after the decimal point
*/
public static Binary decimalWrite(String val, int prec, int scale) {
HiveDecimal hiveDecimal = HiveDecimal.create(val);
byte[] decimalBytes = hiveDecimal.bigIntegerBytesScaled(scale);
// Estimated number of bytes needed.
int precToBytes = ParquetHiveSerDe.PRECISION_TO_BYTE_COUNT[prec - 1];
if (precToBytes == decimalBytes.length) {
// No padding needed.
return Binary.fromByteArray(decimalBytes);
}
byte[] tgt = new byte[precToBytes];
if (hiveDecimal.signum() == -1) {
// For negative number, initializing bits to 1
for (int i = 0; i < precToBytes; i++) {
tgt[i] |= 0xFF;
}
}
System.arraycopy(decimalBytes, 0, tgt, precToBytes - decimalBytes.length, decimalBytes.length); // Padding leading zeroes/ones.
return Binary.fromByteArray(tgt);
}
/**
* Converts a Date to an Integer (days since epoch) for writing
*/
public static Integer dateWrite(Date date) {
return Integer.valueOf(DateWritable.dateToDays(date));
}
/**
* Writes a List value into a Group
* @param group the parent group
* @param index the field index of this column in the group; a field name could also be used instead
* @param values the list elements; String here is only an example, write values matching the element type defined for the List
* @return the parent group
*/
public static Group listWrite(Group group, int index,List<String> values){
Group listGroup = group.addGroup(index);
for(String v : values){
Group bagGroup = listGroup.addGroup(0);
bagGroup.add(0,v);
}
return group;
}
/**
* Writes a Map value into a Group
* @param group the parent group
* @param index the field index of this column in the group; a field name could also be used instead
* @param values the key and value types here are only examples; use the types defined for the Map
*/
public static Group mapWrite(Group group, int index, Map<String,String> values){
Group mapGroup = group.addGroup(index);
Iterator<String> iterable = values.keySet().iterator();
while (iterable.hasNext()){
String key = iterable.next();
String value = values.get(key);
Group dataGroup = mapGroup.addGroup(0);
dataGroup.add("key",key);
dataGroup.add("value",value);
}
return group;
}
/**
* Writes a Struct value into a Group
* @param group the parent group
* @param index the field index of this column in the group; a field name could also be used instead
* @param values example values only; pass values matching the struct's field types
* @return the parent group
*/
public static Group structWrite(Group group, int index,String[] values){
Group structGroup =group.addGroup(index);
for(int i = 0; i < values.length; i++){
structGroup.add(i,values[i]);
}
return group;
}
}
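To tie the two parts together, here is a minimal end-to-end sketch (not part of the original code) that builds a schema with ParquetDataSchema, converts one row of values with the ParquetDataWrite helpers above, and writes the row with ExampleParquetWriter from parquet-hadoop. The class name ParquetWriteTest, the column layout, and the output path /tmp/parquet_write_test.parquet are illustrative assumptions.
package com.study.spark.mr.utils;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.serde2.typeinfo.DecimalTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.ListTypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

public class ParquetWriteTest {
    public static void main(String[] args) throws Exception {
        // 1. Build the schema with the converter from Part I.
        List<String> columnNames = new ArrayList<>();
        List<TypeInfo> columnTypes = new ArrayList<>();
        columnNames.add("name");
        columnTypes.add(TypeInfoFactory.stringTypeInfo);
        columnNames.add("age");
        columnTypes.add(TypeInfoFactory.intTypeInfo);
        columnNames.add("total");
        columnTypes.add(new DecimalTypeInfo(22, 2));
        ListTypeInfo listTypeInfo = new ListTypeInfo();
        listTypeInfo.setListElementTypeInfo(TypeInfoFactory.stringTypeInfo);
        columnNames.add("list_test");
        columnTypes.add(listTypeInfo);
        MessageType schema = ParquetDataSchema.convert(columnNames, columnTypes);

        // 2. Build one row as a Group, converting each value with the helpers from Part II.
        SimpleGroupFactory factory = new SimpleGroupFactory(schema);
        Group row = factory.newGroup();
        row.add("name", ParquetDataWrite.stringWriter("zhangsan"));
        row.add("age", ParquetDataWrite.intWriter(30));
        row.add("total", ParquetDataWrite.decimalWrite("12345.67", 22, 2));
        // list_test is the fourth column, so its field index is 3.
        ParquetDataWrite.listWrite(row, 3, Arrays.asList("a", "b", "c"));

        // 3. Write the row; the output path is only an example.
        Path file = new Path("/tmp/parquet_write_test.parquet");
        try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
                .withConf(new Configuration())
                .withType(schema)
                .build()) {
            writer.write(row);
        }
    }
}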
Summary
- For the group data types (List, Map, Struct), the value parameters in the write helpers above are only examples; the values actually written must match the data types defined in the schema.