Kafka Connect runs in distributed mode across three machines: 192.168.1.204, 192.168.1.100, and 192.168.1.200. Port 18083 is open (the default 8083 was already taken) for submitting connectors to the Kafka Connect worker processes.
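A quick way to verify that a worker is reachable on that port is the standard Connect REST API, for example listing the currently registered connectors (host and port here match the setup above):

curl http://192.168.1.204:18083/connectors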
While using the Confluent HDFS Sink Connector to move data from Kafka into HDFS and on into a Hive table, the connector was configured as follows:
{
  "name": "dev_hdfs-sink",
  "config": {
    "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
    "tasks.max": "1",
    "topics": "user_option5",
    "hdfs.url": "hdfs://192.168.1.204:14000",
    "flush.size": "3",
    "hive.integration": true,
    "hive.database": "test_db",
    "hive.metastore.uris": "thrift://192.168.1.204:9083",
    "schema.compatibility": "BACKWARD"
  }
}
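Once a connector has been submitted, its health can be checked through the same REST API (a standard Connect endpoint; the connector name matches the config above):

curl http://192.168.1.204:18083/connectors/dev_hdfs-sink/status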
Running a SELECT against the Hive table then failed:
12: jdbc:hive2://localhost:10000> select * from user_option5;
19:41:29.901 [main] DEBUG org.apache.thrift.transport.TSaslTransport - writing data length: 133
19:41:29.958 [main] DEBUG org.apache.thrift.transport.TSaslTransport - CLIENT: reading data length: 4594
Error: Error while compiling statement: FAILED: RuntimeException MetaException(message:org.apache.hadoop.hive.serde2.avro.AvroSerdeException Schema for table must be of type RECORD. Received type: STRING) (state=42000,code=40000)
Inspecting the CREATE TABLE statement the connector generated automatically:
12: jdbc:hive2://localhost:10000> show create table user_option5;
+--------------------------------------------------------------------+--+
| createtab_stmt |
+--------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option5`( |
| ) |
| PARTITIONED BY ( |
| `partition` string COMMENT '') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
| LOCATION |
| 'hdfs://192.168.1.204:14000/topics/user_option5' |
| TBLPROPERTIES ( |
| 'avro.schema.literal'='{"type":"string","connect.version":1}', |
| 'transient_lastDdlTime'='1533469246') |
+--------------------------------------------------------------------+--+
Note that the type inside avro.schema.literal is string, while the Hive AvroSerDe requires the top-level type of a table schema to be record (hence the "Schema for table must be of type RECORD" error).
The root cause is that the KafkaProducer sent its messages without specifying any schema, so only a primitive string schema ended up registered for the topic.
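For reference, a producer along these lines (a reconstruction, not the original code) would register exactly such a primitive schema, because the KafkaAvroSerializer maps a bare java.lang.String to the Avro primitive "string":

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;

/** Hypothetical "before" producer: Avro serializer, but plain String values. */
public class NoSchemaProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092");
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put("schema.registry.url", "http://192.168.1.204:18081");
        KafkaProducer<Object, Object> producer = new KafkaProducer<>(props);
        // A bare String carries no record schema, so Schema Registry stores {"type":"string"}.
        producer.send(new ProducerRecord<>("user_option5", "some plain payload"));
        producer.close();
    }
}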
After specifying the schema in the producer code, the problem went away:
package com.superid.kafka.producer;

import io.confluent.kafka.serializers.KafkaAvroSerializerConfig;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.util.HashMap;
import java.util.Map;
import java.util.Properties;
import java.util.stream.IntStream;

/**
 * @author
 * @create: 2018-08-01 17:55
 */
public class SimuKafkaProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "192.168.1.204:9092,192.168.1.100:9092,192.168.1.200:9092");
        props.put("acks", "all");
        props.put("retries", Integer.MAX_VALUE);
        props.put("batch.size", 16384);
        props.put("linger.ms", 1);
        props.put("buffer.memory", 33554432);
        props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, io.confluent.kafka.serializers.KafkaAvroSerializer.class);
        // Schema Registry location.
        props.put(KafkaAvroSerializerConfig.SCHEMA_REGISTRY_URL_CONFIG,
                "http://192.168.1.204:18081,http://192.168.1.100:18081,http://192.168.1.200:18081");
        KafkaProducer producer = new KafkaProducer(props);

        // The message schema must be spelled out explicitly.
        String userOptionSchema = "{\"type\":\"record\",\"name\":\"user_option8\"," +
                "\"fields\":[" +
                "{\"name\":\"allianceId\",\"type\":\"long\"}," +
                "{\"name\":\"affairId\",\"type\":\"long\"}," +
                "{\"name\":\"userId\",\"type\":\"int\"}," +
                "{\"name\":\"opType\",\"type\":\"string\"}," +
                "{\"name\":\"beOperatedRoleId\",\"type\":\"string\"}," +
                "{\"name\":\"attrs\",\"type\":{\"type\": \"map\", \"values\":\"string\"}}" +
                "]}";
        Schema.Parser parser = new Schema.Parser();
        Schema schema = parser.parse(userOptionSchema);
        try {
            IntStream.range(1, 100).forEach(index -> {
                Map<String, Object> map = new HashMap<>();
                map.put("name", "pilaf");
                map.put("age", index + 20 + "");
                GenericRecord avroRecord = new GenericData.Record(schema);
                avroRecord.put("allianceId", 11L);
                avroRecord.put("affairId", 23L);
                avroRecord.put("userId", 12);
                avroRecord.put("opType", "c");
                avroRecord.put("beOperatedRoleId", "str111");
                avroRecord.put("attrs", map);
                // The ProducerRecord must wrap the GenericRecord that carries the schema.
                producer.send(new ProducerRecord("user_option8", index, avroRecord));
            });
        } catch (Exception e) {
            // may need to do something with it
            e.printStackTrace();
        } finally {
            producer.flush();
            producer.close();
        }
    }
}
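After the producer has run, what was registered can be confirmed with a standard Schema Registry call (assuming the default TopicNameStrategy, under which the value schema lives in the subject user_option8-value):

curl http://192.168.1.204:18081/subjects/user_option8-value/versions/latest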
Delete the old connector by running this in a Linux shell:
curl -X DELETE localhost:18083/connectors/dev_hdfs-sink
Then reconfigure the connector (the configuration lives in the file dev_hdfs-sink.json; note the topic name changed):
{
  "name": "dev_hdfs-sink",
  "config": {
    "connector.class": "io.confluent.connect.hdfs.HdfsSinkConnector",
    "tasks.max": "1",
    "topics": "user_option8",
    "hdfs.url": "hdfs://192.168.1.204:14000",
    "flush.size": "3",
    "hive.integration": true,
    "hive.database": "test_db",
    "hive.metastore.uris": "thrift://192.168.1.204:9083",
    "schema.compatibility": "BACKWARD"
  }
}
Then submit the connector again:
cd /home/tidb/confluent-4.1.1/etc/kafka-connect-hdfs
curl -X POST -H "Content-Type: application/json" --data @dev_hdfs-sink.json http://192.168.1.204:18083/connectors
With the connector running, running the producer to write some data to the Kafka topic user_option8 automatically lands the messages on HDFS and loads them into the Hive table (table creation is fully automatic, with no manual intervention needed).
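The committed Avro files should be visible under the topic directory on HDFS (the /topics/user_option8 path matches the table LOCATION shown below; the partition=0 subdirectory is an assumption based on the connector's default partitioner and the single-partition data in this example):

hdfs dfs -ls hdfs://192.168.1.204:14000/topics/user_option8/partition=0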
Now look again at the CREATE TABLE statement for the table the connector created (show create table user_option8):
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| createtab_stmt |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
| CREATE EXTERNAL TABLE `user_option8`( |
| `allianceid` bigint COMMENT '', |
| `affairid` bigint COMMENT '', |
| `userid` int COMMENT '', |
| `optype` string COMMENT '', |
| `beoperatedroleid` string COMMENT '', |
| `attrs` map<string,string> COMMENT '') |
| PARTITIONED BY ( |
| `partition` string COMMENT '') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.serde2.avro.AvroSerDe' |
| STORED AS INPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerInputFormat' |
| OUTPUTFORMAT |
| 'org.apache.hadoop.hive.ql.io.avro.AvroContainerOutputFormat' |
| LOCATION |
| 'hdfs://192.168.1.204:14000/topics/user_option8' |
| TBLPROPERTIES ( |
| 'avro.schema.literal'='{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type":"map","values":"string"}}],"connect.version":1,"connect.name":"user_option8"}', |
| 'transient_lastDdlTime'='1533525123') |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--+
As you can see, avro.schema.literal in the new table definition now contains the full record schema.
Querying the Hive table again now returns data:
| user_option8.allianceid | user_option8.affairid | user_option8.userid | user_option8.optype | user_option8.beoperatedroleid | user_option8.attrs | user_option8.partition |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"22"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"23"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"24"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"25"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"26"} | 0 |
| 11 | 23 | 12 | c | str111 | {"name":"pilaf","age":"27"} | 0 |
+--------------------------+------------------------+----------------------+----------------------+--------------------------------+--------------------------------+-------------------------+--+
The schema string above is long and unpleasant to write by hand, so to make it easier to construct, I put together a simple wrapper:
package com.superid.entity;

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * @author dufeng
 * @create: 2018-08-06 13:55
 */
public class MySchema {
    /**
     * Top-level schema type required for Hive table integration.
     */
    public static final String RECORD_TYPE = "record";
    /**
     * The schema type; usually record.
     */
    private String type;
    /**
     * The schema name.
     */
    private String name;
    /**
     * The fields of the schema.
     */
    private List<Map<String, String>> fields = new ArrayList<>();

    /**
     * No public constructor; instances are created through the Builder.
     */
    private MySchema() {
    }

    private MySchema(Builder builder) {
        type = builder.type;
        name = builder.name;
        fields = builder.fields;
    }

    public static final class Builder {
        private String type;
        private String name;
        private List<Map<String, String>> fields = new ArrayList<>();

        public Builder() {
        }

        public Builder type(String val) {
            type = val;
            return this;
        }

        public Builder name(String val) {
            name = val;
            return this;
        }

        public Builder fields(List<Map<String, String>> fieldMap) {
            fields = fieldMap;
            return this;
        }

        public Builder field(String name, String type) {
            type = type.toLowerCase();
            Map<String, String> aFieldMap = new HashMap<>();
            aFieldMap.put("name", name);
            aFieldMap.put("type", type);
            fields.add(aFieldMap);
            return this;
        }

        public Builder field(Map<String, String> map) {
            fields.add(map);
            return this;
        }

        public MySchema build() {
            return new MySchema(this);
        }
    }

    @Override
    public String toString() {
        StringBuilder fieldStr = new StringBuilder("[");
        for (Map<String, String> map : fields) {
            fieldStr.append("{");
            fieldStr.append("\"name\":").append("\"").append(map.get("name")).append("\",");
            // A complex type (e.g. a map) arrives as a JSON object and must not be quoted.
            if (map.get("type").startsWith("{")) {
                fieldStr.append("\"type\":").append(map.get("type"));
            } else {
                fieldStr.append("\"type\":").append("\"").append(map.get("type")).append("\"");
            }
            fieldStr.append("}");
            fieldStr.append(",");
        }
        // Remove the trailing comma (only present when at least one field was added).
        if (!fields.isEmpty()) {
            fieldStr.deleteCharAt(fieldStr.length() - 1);
        }
        fieldStr.append("]");
        return "{" +
                "\"type\":\"" + type + '\"' +
                ",\"name\":\"" + name + '\"' +
                ",\"fields\":" + fieldStr.toString() +
                '}';
    }
}
The schema string can now be built with the following code, minimizing the clutter in business code:
String userOptionSchema = new MySchema.Builder()
        .type(MySchema.RECORD_TYPE)
        .name("user_option8")
        .field("allianceId", "long")
        .field("affairId", "long")
        .field("userId", "int")
        .field("opType", "string")
        .field("beOperatedRoleId", "string")
        .field("attrs", "{\"type\": \"map\", \"values\":\"string\"}")
        .build()
        .toString();
Schema.Parser parser = new Schema.Parser();
Schema schema = parser.parse(userOptionSchema);
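As a quick sanity check, printing userOptionSchema should yield the same JSON as the hand-written string used in the producer earlier (aside from the whitespace inside the map type, which the builder passes through verbatim):

{"type":"record","name":"user_option8","fields":[{"name":"allianceId","type":"long"},{"name":"affairId","type":"long"},{"name":"userId","type":"int"},{"name":"opType","type":"string"},{"name":"beOperatedRoleId","type":"string"},{"name":"attrs","type":{"type": "map", "values":"string"}}]}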