Kafka Connect runs in distributed mode across three machines: 192.168.1.204, 192.168.1.100, and 192.168.1.200. Port 18083 is open (the default 8083 was already taken) for submitting connectors to the Kafka Connect worker processes.
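A quick way to verify that a worker is reachable on that port is the standard Connect REST API, for example listing the currently registered connectors (host and port here match the setup above):

curl http://192.168.1.204:18083/connectors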
While using the Confluent HDFS Sink Connector to move data from Kafka into HDFS and on into a Hive table, the connector was configured as follows:
{
  "name": "dev_hdfs-sink",
  "config": {
    "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
    "tasks.max": "1",
    "topics": "user_option5",
    "hdfs.url": "hdfs://192.168.1.204:14000",
    "flush.size": "3",
    "hive.integration": true,
    "hive.database": "test_db",
    "hive.metastore.uris": "thrift://192.168.1.204:9083",
    "schema.compatibility": "BACKWARD"
  }
}
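Once a connector has been submitted, its health can be checked through the same REST API (a standard Connect endpoint; the connector name matches the config above):

curl http://192.168.1.204:18083/connectors/dev_hdfs-sink/status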
Running a SELECT against the Hive table then failed:
12: jdbc:hive2://localhost:10000> select * from user_option5;
19:41:29.901 [main] DEBUG org.apache.thrift.transport.TSaslTransport - writing data length: 133
19:41:29.958 [main] DEBUG org.apache.thrift.transport.TSaslTransport - CLIENT: reading data length: 4594
Error: Error while compiling statement: FAILED: RuntimeException MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: STRING) (state=42000,code=40000)
Inspecting the CREATE TABLE statement the connector generated automatically:
12: jdbc:hive2://localhost:10000> show create table user_option5;
+--------------------------------------------------------------------+--+
| createtab_stmt |
+--------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option5`( |
| ) |
| PARTITIONED BY ( |
| `partition` string COMMENT '') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
| LOCATION |
| 'hdfs://192.168.1.204:14000/topics/user_option5' |
| TBLPROPERTIES ( |
| 'avro.schema.literal'='{"type":"string","connect.version":1}', |
| 'transient_lastDdlTime'='1533469246') |
+--------------------------------------------------------------------+--+
Note that the type inside avro.schema.literal is string, while the Hive AvroSerDe requires the top-level type of a table schema to be record (hence the "Schema for table must be of type RECORD" error).
The root cause is that the KafkaProducer sent its messages without specifying any schema, so only a primitive string schema ended up registered for the topic.
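For reference, a producer along these lines (a reconstruction, not the original code) would register exactly such a primitive schema, because the KafkaAvroSerializer maps a bare java.lang.String to the Avro primitive "string":

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;

/** Hypothetical "before" producer: Avro serializer, but plain String values. */
public class NoSchemaProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put("schema.registry.url", "http://192.168.1.204:18081");
        KafkaProducer<Object, Object> producer = new KafkaProducer<>(props);
        // A bare String carries no record schema, so Schema Registry stores {"type":"string"}.
        producer.send(new ProducerRecord<>("user_option5", "some plain payload"));
        producer.close();
    }
}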
After specifying the schema in the producer code, the problem went away:
package com.superid.kafka.producer;

import io.confluent.kafka.serializers.KafkaAvroSerializerConfig;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.stream.IntStream;

/**
 * @author
 * @create: 2018-08-01 17:55
 */
public class SimuKafkaProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092");
        props.put("acks", "all");
        props.put("retries", Integer.MAX_VALUE);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        // Schema Registry location.
        props.put(KafkaAvroSerializerConfig.SCHEMA_REGISTRY_URL_CONFIG,
                "http://192.168.1.204:18081,http://192.168.1.100:18081,http://192.168.1.200:18081");
        KafkaProducer producer = new KafkaProducer(props);

        // The message schema must be spelled out explicitly.
        String userOptionSchema = "{\"type\":\"record\",\"name\":\"user_option8\"," +
                "\"fields\":[" +
                "{\"name\":\"allianceId\",\"type\":\"long\"}," +
                "{\"name\":\"affairId\",\"type\":\"long\"}," +
                "{\"name\":\"userId\",\"type\":\"int\"}," +
                "{\"name\":\"opType\",\"type\":\"string\"}," +
                "{\"name\":\"beOperatedRoleId\",\"type\":\"string\"}," +
                "{\"name\":\"attrs\",\"type\":{\"type\": \"map\", \"values\":\"string\"}}" +
                "]}";
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);
        try {
            IntStream.range(1, 100).forEach(index -> {
                Map<String, Object> map = new HashMap<>();
                map.put("name", "pilaf");
                map.put("age", index + 20 + "");
                GenericRecord avroRecord = new GenericData.Record(schema);
                avroRecord.put("allianceId", 11L);
                avroRecord.put("affairId", 23L);
                avroRecord.put("userId", 12);
                avroRecord.put("opType", "c");
                avroRecord.put("beOperatedRoleId", "str111");
                avroRecord.put("attrs", map);
                // The ProducerRecord must wrap the GenericRecord that carries the schema.
                producer.send(new ProducerRecord("user_option8", index, avroRecord));
            });
        } catch (Exception e) {
            // may need to do something with it
            e.printStackTrace();
        } finally {
            producer.flush();
            producer.close();
        }
    }
}
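After the producer has run, what was registered can be confirmed with a standard Schema Registry call (assuming the default TopicNameStrategy, under which the value schema lives in the subject user_option8-value):

curl http://192.168.1.204:18081/subjects/user_option8-value/versions/latest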
Delete the old connector by running this in a Linux shell:
curl -X DELETE localhost:18083/connectors/dev_hdfs-sink
Then reconfigure the connector (the configuration lives in the file dev_hdfs-sink.json; note the topic name changed):
{
  "name": "dev_hdfs-sink",
  "config": {
    "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
    "tasks.max": "1",
    "topics": "user_option8",
    "hdfs.url": "hdfs://192.168.1.204:14000",
    "flush.size": "3",
    "hive.integration": true,
    "hive.database": "test_db",
    "hive.metastore.uris": "thrift://192.168.1.204:9083",
    "schema.compatibility": "BACKWARD"
  }
}
Then submit the connector again:
cd /home/tidb/confluent-4.1.1/etc/kafka-connect-hdfs
curl -X POST -H "Content-Type: application/json" --data @dev_hdfs-sink.json http://192.168.1.204:18083/connectors
With the connector running, running the producer to write some data to the Kafka topic user_option8 automatically lands the messages on HDFS and loads them into the Hive table (table creation is fully automatic, with no manual intervention needed).
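The committed Avro files should be visible under the topic directory on HDFS (the /topics/user_option8 path matches the table LOCATION shown below; the partition=0 subdirectory is an assumption based on the connector's default partitioner and the single-partition data in this example):

hdfs dfs -ls hdfs://192.168.1.204:14000/topics/user_option8/partition=0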
Now look again at the CREATE TABLE statement for the table the connector created (show create table user_option8):
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| createtab_stmt |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option8`( |
| `allianceid` bigint COMMENT '', |
| `affairid` bigint COMMENT '', |
| `userid` int COMMENT '', |
| `optype` string COMMENT '', |
| `beoperatedroleid` string COMMENT '', |
| `attrs` map<string,string> COMMENT '') |
| PARTITIONED BY ( |
| `partition` string COMMENT '') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
| LOCATION |
| 'hdfs://192.168.1.204:14000/topics/user_option8' |
| TBLPROPERTIES ( |
| 'avro.schema.literal'='{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type":"map","values":"string"}}],"connect.version":1,"connect.name":"user_option8"}', |
| 'transient_lastDdlTime'='1533525123') |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
As you can see, avro.schema.literal in the new table definition now contains the full record schema.
Querying the Hive table again now returns data:
| user_option8.allianceid | user_option8.affairid | user_option8.userid | user_option8.optype | user_option8.beoperatedroleid | user_option8.attrs | user_option8.partition |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"22"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"23"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"24"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"25"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"26"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"27"} | 0 |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
The schema string above is long and unpleasant to write by hand, so to make it easier to construct, I put together a simple wrapper:
package com.superid.entity;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @author dufeng
 * @create: 2018-08-06 13:55
 */
public class MySchema {
    /**
     * Top-level schema type required for Hive table integration.
     */
    public static final String RECORD_TYPE = "record";
    /**
     * The schema type; usually record.
     */
    private String type;
    /**
     * The schema name.
     */
    private String name;
    /**
     * The fields of the schema.
     */
    private List<Map<String, String>> fields = new ArrayList<>();

    /**
     * No public constructor; instances are created through the Builder.
     */
    private MySchema() {
    }

    private MySchema(Builder builder) {
        type = builder.type;
        name = builder.name;
        fields = builder.fields;
    }

    public static final class Builder {
        private String type;
        private String name;
        private List<Map<String, String>> fields = new ArrayList<>();

        public Builder() {
        }

        public Builder type(String val) {
            type = val;
            return this;
        }

        public Builder name(String val) {
            name = val;
            return this;
        }

        public Builder fields(List<Map<String, String>> fieldMap) {
            fields = fieldMap;
            return this;
        }

        public Builder field(String name, String type) {
            type = type.toLowerCase();
            Map<String, String> aFieldMap = new HashMap<>();
            aFieldMap.put("name", name);
            aFieldMap.put("type", type);
            fields.add(aFieldMap);
            return this;
        }

        public Builder field(Map<String, String> map) {
            fields.add(map);
            return this;
        }

        public MySchema build() {
            return new MySchema(this);
        }
    }

    @Override
    public String toString() {
        StringBuilder fieldStr = new StringBuilder("[");
        for (Map<String, String> map : fields) {
            fieldStr.append("{");
            fieldStr.append("\"name\":").append("\"").append(map.get("name")).append("\",");
            // A complex type (e.g. a map) arrives as a JSON object and must not be quoted.
            if (map.get("type").startsWith("{")) {
                fieldStr.append("\"type\":").append(map.get("type"));
            } else {
                fieldStr.append("\"type\":").append("\"").append(map.get("type")).append("\"");
            }
            fieldStr.append("}");
            fieldStr.append(",");
        }
        // Remove the trailing comma (only present when at least one field was added).
        if (!fields.isEmpty()) {
            fieldStr.deleteCharAt(fieldStr.length() - 1);
        }
        fieldStr.append("]");
        return "{" +
                "\"type\":\"" + type + '\"' +
                ",\"name\":\"" + name + '\"' +
                ",\"fields\":" + fieldStr.toString() +
                '}';
    }
}
The schema string can now be built with the following code, minimizing the clutter in business code:
String userOptionSchema = new MySchema.Builder()
        .type(MySchema.RECORD_TYPE)
        .name("user_option8")
        .field("allianceId", "long")
        .field("affairId", "long")
        .field("userId", "int")
        .field("opType", "string")
        .field("beOperatedRoleId", "string")
        .field("attrs", "{\"type\": \"map\", \"values\":\"string\"}")
        .build()
        .toString();
Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(userOptionSchema);
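As a quick sanity check, printing userOptionSchema should yield the same JSON as the hand-written string used in the producer earlier (aside from the whitespace inside the map type, which the builder passes through verbatim):

{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type": "map", "values":"string"}}]}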