Big Data: Converting Avro to ORC


Background

Since big data took off, a number of file formats have emerged. Two of the most widely used are ORC (common for data at rest, for example on S3) and Avro (the de facto format around Confluent Kafka). This post collects a concrete, Java-based implementation for converting between the two.

Maven Dependencies

Note that two things used by the code below arrive transitively: Gson (used for JSON parsing) comes in via protobuf-java-util, and the Hive ColumnVector classes come in via orc-core's dependency on hive-storage-api.

 <dependencies>
        <dependency>
            <groupId>org.apache.orc</groupId>
            <artifactId>orc-core</artifactId>
            <version>1.7.6</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.6.0</version>
        </dependency>

<!--        <dependency>-->
<!--            <groupId>org.apache.hadoop</groupId>-->
<!--            <artifactId>hadoop-hdfs</artifactId>-->
<!--            <version>3.3.2</version>-->
<!--        </dependency>-->

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>27.0.1-jre</version>
        </dependency>


        <dependency>
            <groupId>org.apache.avro</groupId>
            <artifactId>avro</artifactId>
            <version>1.9.2</version>
        </dependency>
        <dependency>
            <groupId>org.springframework</groupId>
            <artifactId>spring-core</artifactId>
            <version>5.2.1.RELEASE</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>com.google.protobuf</groupId>
            <artifactId>protobuf-java-util</artifactId>
            <version>3.20.0</version>
        </dependency>

        <dependency>
            <groupId>net.minidev</groupId>
            <artifactId>json-smart</artifactId>
            <version>2.4.7</version>
        </dependency>
    </dependencies>

Code

AvroAndOrcTest.java drives the whole flow: generate a random Avro file, convert it to ORC, then read the ORC file back:

import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.avro.Schema;
import org.apache.avro.file.DataFileReader;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.util.RandomData; // ships with the Avro test utilities (test-jar / avro-tools)
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.OrcFile;
import org.apache.orc.Reader;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.apache.orc.Writer;

import java.io.File;
import java.io.IOException;

public class AvroAndOrcTest {
    public static void main(String[] args) throws IOException {
        // Windows only: hadoop.home.dir must point at a local winutils installation (see the conclusion)
        System.setProperty("hadoop.home.dir", "C:\\Users\\lv_ry601\\Desktop\\winutils_jb51\\winutils-master\\hadoop-2.6.3");
        String avroFile = "test.avro";
        String avroSchema = "avro_schema.txt";
        String targetOrcFile = "target.orc";

        String orcSchemaString = "struct<testString:string,testInt:int,testDouble:double>";
        TypeDescription orcSchema = TypeDescription.fromString(orcSchemaString);

        // 1) generate random Avro data, 2) convert it to ORC, 3) read the ORC file back
        avroGenerate(avroSchema, avroFile, "10");
        readAvroWriteOrc(new Schema.Parser().parse(new File(avroSchema)), orcSchema, avroFile, targetOrcFile);
        readOrc(targetOrcFile, orcSchema);
    }

    private static void readAvroWriteOrc(Schema avroSchema, TypeDescription orcSchema, String avroFile, String targetOrcFile) {
        // One JSON-to-ColumnVector converter per top-level ORC field
        VectorColumnFiller.JsonConverter[] jsonConverters = new VectorColumnFiller.JsonConverter[orcSchema.getChildren().size()];
        int i = 0;
        for (TypeDescription child : orcSchema.getChildren()) {
            jsonConverters[i++] = VectorColumnFiller.createConverter(child);
        }
        VectorizedRowBatch rowBatch = orcSchema.createRowBatch();
        OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(new Configuration());
        writerOptions.setSchema(orcSchema);
        try (Writer writer = OrcFile.createWriter(new Path(targetOrcFile), writerOptions);
             DataFileReader<GenericData.Record> reader =
                     new DataFileReader<>(new File(avroFile), new GenericDatumReader<GenericData.Record>(avroSchema))) {
            while (reader.hasNext()) {
                GenericData.Record rowData = reader.next();
                // Round-trip each Avro record through its JSON representation
                JsonElement jsonElement = new JsonParser().parse(rowData.toString());
                if (jsonElement instanceof JsonObject) {
                    System.out.println(jsonElement);
                    VectorColumnFiller.fillRow(rowBatch.size, jsonConverters, orcSchema, rowBatch, (JsonObject) jsonElement);
                    // Flush the batch to the ORC writer whenever it fills up
                    if (rowBatch.size == rowBatch.getMaxSize()) {
                        writer.addRowBatch(rowBatch);
                        rowBatch.reset();
                    }
                }
            }
            // Flush the final, partially filled batch
            if (rowBatch.size > 0) {
                writer.addRowBatch(rowBatch);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }



    public static void avroGenerate(String schemaFileName, String avroFile, String count) {
        try {
            // RandomData writes `count` random records conforming to the schema into avroFile
            String[] args = {schemaFileName, avroFile, count};
            RandomData.main(args);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    public static void readOrc(String path, TypeDescription schema) {
        Configuration configuration = new Configuration();
        OrcFile.ReaderOptions readerOptions = new OrcFile.ReaderOptions(configuration);
        VectorizedRowBatch rowBatch = schema.createRowBatch();
        System.out.println();
        System.out.println("read written files----------------------------------");
        try (Reader reader = OrcFile.createReader(new Path(path), readerOptions);
             RecordReader rows = reader.rows()) {
            while (rows.nextBatch(rowBatch)) {
                System.out.println(rowBatch);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
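
For reference, a minimal avro_schema.txt matching the ORC schema above could look like this (a sketch: the record name is arbitrary, only the field names and types must line up with the ORC struct so that fillRow can find them by name):

{
  "type": "record",
  "name": "TestRecord",
  "fields": [
    {"name": "testString", "type": "string"},
    {"name": "testInt",    "type": "int"},
    {"name": "testDouble", "type": "double"}
  ]
}

The helper class below, VectorColumnFiller.java (credited to Ashish in its Javadoc), does the actual JSON-to-ColumnVector conversion: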



import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonPrimitive;
import org.apache.hadoop.hive.common.type.HiveDecimal;
import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.ListColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.MapColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.UnionColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.TypeDescription;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.nio.charset.StandardCharsets;
import java.sql.Timestamp;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * 
 * @author Ashish (ashu.impetus@gmail.com)
 *
 */
public class VectorColumnFiller {
    private static final Logger LOG = LoggerFactory.getLogger(VectorColumnFiller.class);

    public interface JsonConverter {
        void convert(JsonElement value, ColumnVector vect, int row);
    }

    static class BooleanColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                LongColumnVector vector = (LongColumnVector) vect;
                vector.vector[row] = value.getAsBoolean() ? 1 : 0;
            }
        }
    }

    static class LongColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                LongColumnVector vector = (LongColumnVector) vect;
                vector.vector[row] = value.getAsLong();
            }
        }
    }

    static class DoubleColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                DoubleColumnVector vector = (DoubleColumnVector) vect;
                vector.vector[row] = value.getAsDouble();
            }
        }
    }

    static class StringColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                BytesColumnVector vector = (BytesColumnVector) vect;
                byte[] bytes = value.getAsString().getBytes(
                        StandardCharsets.UTF_8);
                vector.setRef(row, bytes, 0, bytes.length);
            }
        }
    }

    static class BinaryColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                BytesColumnVector vector = (BytesColumnVector) vect;
                String binStr = value.getAsString();
                byte[] bytes = new byte[binStr.length() / 2];
                for (int i = 0; i < bytes.length; ++i) {
                    bytes[i] = (byte) Integer.parseInt(
                            binStr.substring(i * 2, i * 2 + 2), 16);
                }
                vector.setRef(row, bytes, 0, bytes.length);
            }
        }
    }

    static class TimestampColumnConverter implements JsonConverter {
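        // BackOffUtil is a small project-local helper (not included in this post)
        // used to rate-limit the warning logged below.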
        BackOffUtil back = new BackOffUtil(true);

        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                if (value.getAsJsonPrimitive().isString()) {
                    TimestampColumnVector vector = (TimestampColumnVector) vect;
                    vector.set(
                            row,
                            Timestamp.valueOf(value.getAsString().replaceAll(
                                    "[TZ]", " ")));
                } else if (value.getAsJsonPrimitive().isNumber()) {
                    TimestampColumnVector vector = (TimestampColumnVector) vect;
                    vector.set(
                            row,
                            new Timestamp(value.getAsLong()));
                } else {
                    if (!back.isBackOff()) {
                        LOG.warn("Timestamp is neither string nor number: {}", value);
                    }
                    vect.noNulls = false;
                    vect.isNull[row] = true;
                }
            }
        }
    }

    static class DecimalColumnConverter implements JsonConverter {
        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                DecimalColumnVector vector = (DecimalColumnVector) vect;
                vector.vector[row].set(HiveDecimal.create(value.getAsString()));
            }
        }
    }

//    static class MapColumnConverter implements JsonConverter {
//        private JsonConverter[] childConverters;
//
//        public MapColumnConverter(TypeDescription schema) {
//            assertKeyType(schema);
//
//            List<TypeDescription> childTypes = schema.getChildren();
//            childConverters = new JsonConverter[childTypes.size()];
//            for (int c = 0; c < childConverters.length; ++c) {
//                childConverters[c] = createConverter(childTypes.get(c));
//            }
//        }
//
//        /**
//         * Rejects non-string keys. This is a limitation imposed by JSON specifications that only allows strings
//         * as keys.
//         */
//        private void assertKeyType(TypeDescription schema) {
//            // NOTE: It may be tempting to ensure that schema.getChildren() returns at least one child here, but the
//            // validity of an ORC schema is ensured by TypeDescription. Malformed ORC schema could be a concern.
//            // For example, an ORC schema of `map<>` may produce a TypeDescription instance with no child. However,
//            // TypeDescription.fromString() rejects any malformed ORC schema and therefore we may assume only valid
//            // ORC schema will make to this point.
//            TypeDescription keyType = schema.getChildren().get(0);
//            String keyTypeName = keyType.getCategory().getName();
//            if (!keyTypeName.equalsIgnoreCase("string")) {
//                throw new IllegalArgumentException(
//                        String.format("Unsupported key type: %s", keyTypeName));
//            }
//        }
//
//        public void convert(JsonElement value, ColumnVector vect, int row) {
//            if (value == null || value.isJsonNull()) {
//                vect.noNulls = false;
//                vect.isNull[row] = true;
//            } else {
//                MapColumnVector vector = (MapColumnVector) vect;
//                JsonObject obj = value.getAsJsonObject();
//                vector.lengths[row] = obj.size();
//                vector.offsets[row] = row > 0 ? vector.offsets[row - 1] + vector.lengths[row - 1] : 0;
//
//                // Ensure enough space is available to store the keys and the values
//                vector.keys.ensureSize((int) vector.offsets[row] + obj.size(), true);
//                vector.values.ensureSize((int) vector.offsets[row] + obj.size(), true);
//
//                int i = 0;
//                for (String key : obj.keySet()) {
//                    childConverters[0].convert(new JsonPrimitive(key), vector.keys, (int) vector.offsets[row] + i);
//                    childConverters[1].convert(obj.get(key), vector.values, (int) vector.offsets[row] + i);
//                    i++;
//                }
//            }
//        }
//    }

    /**
     * The primary challenge here is that available type information at the time of class instantiation and at the
     * time of invocation of {@code convert()} is different. We have exact type information when
     * {@code UnionColumnConverter} is instantiated, as it is given as {@code TypeDescription} which represents an
     * ORC schema. Conversely, when {@code convert()} method is called, limited type information is available because
     * JSON supports three primitive types only: boolean, number, and string.
     *
     * The proposed solution for this issue is to register appropriate converters at the time of instantiation with
     * a matching {@code ColumnVector} index. Note that {@code UnionColumnVector} has child column vectors to support
     * each of its child type.
     */
    static class UnionColumnConverter implements JsonConverter {

        private enum JsonType {
            NULL, BOOLEAN, NUMBER, STRING, ARRAY, OBJECT
        }

        // TODO: Could we come up with a better name?
        private class ConverterInfo {
            private int vectorIndex;
            private JsonConverter converter;

            public ConverterInfo(int vectorIndex, JsonConverter converter) {
                this.vectorIndex = vectorIndex;
                this.converter = converter;
            }

            public int getVectorIndex() {
                return vectorIndex;
            }

            public JsonConverter getConverter() {
                return converter;
            }
        }

        /**
         * Union type in ORC is essentially a collection of two or more non-compatible types,
         * and it is represented by multiple child columns under UnionColumnVector.
         * Thus we need converters for each type.
         */
        private Map<JsonType, ConverterInfo> childConverters = new HashMap<>();

        public UnionColumnConverter(TypeDescription schema) {
            List<TypeDescription> children = schema.getChildren();
            int index = 0;
            for (TypeDescription childType : children) {
                JsonType jsonType = getJsonType(childType.getCategory());
                JsonConverter converter = createConverter(childType);
                // FIXME: Handle cases where childConverters is pre-occupied with the same mask
                childConverters.put(jsonType, new ConverterInfo(index++, converter));
            }
        }

        private JsonType getJsonType(TypeDescription.Category category) {
            switch (category) {
                case BOOLEAN:
                    return JsonType.BOOLEAN;
                case BYTE:
                case SHORT:
                case INT:
                case LONG:
                case FLOAT:
                case DOUBLE:
                case DECIMAL:
                    return JsonType.NUMBER;
                case CHAR:
                case VARCHAR:
                case STRING:
                    return JsonType.STRING;
                default:
                    throw new UnsupportedOperationException();
            }
        }

        private JsonType getJsonType(JsonPrimitive value) {
            if (value.isBoolean()) {
                return JsonType.BOOLEAN;
            } else if (value.isNumber()) {
                return JsonType.NUMBER;
            } else if (value.isString()) {
                return JsonType.STRING;
            } else {
                throw new UnsupportedOperationException();
            }
        }

        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else if (value.isJsonPrimitive()) {
                UnionColumnVector vector = (UnionColumnVector) vect;
                JsonPrimitive primitive = value.getAsJsonPrimitive();

                JsonType jsonType = getJsonType(primitive);
                ConverterInfo converterInfo = childConverters.get(jsonType);
                if (converterInfo == null) {
                    String message = String.format("Unable to infer type for '%s'", primitive);
                    throw new IllegalArgumentException(message);
                }

                int vectorIndex = converterInfo.getVectorIndex();
                JsonConverter converter = converterInfo.getConverter();
                vector.tags[row] = vectorIndex;
                converter.convert(value, vector.fields[vectorIndex], row);
            } else {
                // It would be great to support non-primitive types in union type.
                // Let's leave this for another PR in the future.
                throw new UnsupportedOperationException();
            }
        }
    }

    static class StructColumnConverter implements JsonConverter {
        private JsonConverter[] childrenConverters;
        private List<String> fieldNames;

        public StructColumnConverter(TypeDescription schema) {
            List<TypeDescription> kids = schema.getChildren();
            childrenConverters = new JsonConverter[kids.size()];
            for (int c = 0; c < childrenConverters.length; ++c) {
                childrenConverters[c] = createConverter(kids.get(c));
            }
            fieldNames = schema.getFieldNames();
        }

        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                StructColumnVector vector = (StructColumnVector) vect;
                JsonObject obj = value.getAsJsonObject();
                for (int c = 0; c < childrenConverters.length; ++c) {
                    JsonElement elem = obj.get(fieldNames.get(c));
                    childrenConverters[c].convert(elem, vector.fields[c], row);
                }
            }
        }
    }

    static class ListColumnConverter implements JsonConverter {
        private JsonConverter childrenConverter;

        public ListColumnConverter(TypeDescription schema) {
            childrenConverter = createConverter(schema.getChildren().get(0));
        }

        public void convert(JsonElement value, ColumnVector vect, int row) {
            if (value == null || value.isJsonNull()) {
                vect.noNulls = false;
                vect.isNull[row] = true;
            } else {
                ListColumnVector vector = (ListColumnVector) vect;
                JsonArray obj = value.getAsJsonArray();
                vector.lengths[row] = obj.size();
                vector.offsets[row] = vector.childCount;
                vector.childCount += vector.lengths[row];
                vector.child.ensureSize(vector.childCount, true);
                for (int c = 0; c < obj.size(); ++c) {
                    childrenConverter.convert(obj.get(c), vector.child,
                            (int) vector.offsets[row] + c);
                }
            }
        }
    }

    public static JsonConverter createConverter(TypeDescription schema) {
        switch (schema.getCategory()) {
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
            return new LongColumnConverter();
        case FLOAT:
        case DOUBLE:
            return new DoubleColumnConverter();
        case CHAR:
        case VARCHAR:
        case STRING:
            return new StringColumnConverter();
        case DECIMAL:
            return new DecimalColumnConverter();
        case TIMESTAMP:
            return new TimestampColumnConverter();
        case BINARY:
            return new BinaryColumnConverter();
        case BOOLEAN:
            return new BooleanColumnConverter();
        case STRUCT:
            return new StructColumnConverter(schema);
        case LIST:
            return new ListColumnConverter(schema);
        case MAP:
//            return new MapColumnConverter(schema);
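        // NOTE: with MapColumnConverter commented out above, MAP falls through to the
        // UNION branch below and will not handle real map data correctly.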
        case UNION:
            return new UnionColumnConverter(schema);
        default:
            throw new IllegalArgumentException("Unhandled type " + schema);
        }
    }

    public static void fillRow(int rowIndex, JsonConverter[] converters,
            TypeDescription schema, VectorizedRowBatch batch, JsonObject data) {
        List<String> fieldNames = schema.getFieldNames();
        for (int c = 0; c < converters.length; ++c) {
            JsonElement fieldValue = data.get(fieldNames.get(c));
            if (fieldValue == null) {
                batch.cols[c].noNulls = false;
                batch.cols[c].isNull[rowIndex] = true;
            } else {
                converters[c].convert(fieldValue, batch.cols[c], rowIndex);
            }
        }
        batch.size++;
    }
}
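
To make the union handling concrete, here is a minimal sketch of how a JSON value is routed through the union converter (assuming VectorColumnFiller is on the classpath and the same imports as above; the schema string is invented for this example):

        TypeDescription schema = TypeDescription.fromString("struct<u:uniontype<bigint,string>>");
        VectorizedRowBatch batch = schema.createRowBatch();
        // createConverter returns a UnionColumnConverter for the uniontype child
        VectorColumnFiller.JsonConverter conv =
                VectorColumnFiller.createConverter(schema.getChildren().get(0));
        // A JSON string primitive is routed to the string branch of the UnionColumnVector:
        // tags[0] is set to 1 (the index of the string child) and fields[1] receives "hello"
        conv.convert(new JsonPrimitive("hello"), batch.cols[0], 0);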

Conclusion

This is about the most general-purpose code for converting between Avro and ORC using the two formats' Java APIs. Note that reading and writing ORC files on Windows requires the winutils.exe helper (which is why main sets the hadoop.home.dir system property); you need to download it yourself.
