Article 1 (https://mp.csdn.net/postedit/81179811) covered reading Kafka data in a simple way. Because our company's business data comes in a proprietary format, a custom schema class is needed to read it: the first 4 bytes of each record are cut off to get the schemaId, the schemaId is used to look up the schema definition string, and with that schema the network log data stored in the topic is restored.
Straight to the code: read all the schema configurations from ZooKeeper.
package com.matthew.flink;

import com.matthew.util.ZkClientTemplate;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.Serializable;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by developer on 7/19/18.
 * Loads every Avro schema definition stored under the given ZooKeeper path.
 * Each child znode is named by its numeric schemaId and holds the schema JSON string.
 */
public class SchemaMap implements Serializable {

    private static final Logger logger = LoggerFactory.getLogger(SchemaMap.class);

    private Map<Integer, String> schemaMap = new HashMap<Integer, String>();

    public SchemaMap(String zkServer, String avroPath) {
        ZkClientTemplate zookeeperTemplate = new ZkClientTemplate();
        zookeeperTemplate.setZkServers(zkServer);
        zookeeperTemplate.initWithZkSerializer();
        logger.info("read schema in: " + zkServer + " " + avroPath);
        List<String> avroIds = zookeeperTemplate.getChildren(avroPath);
        logger.info("get schema size: " + avroIds.size());
        for (String avroId : avroIds) {
            logger.info("get schema with schemaId: " + avroId);
            schemaMap.put(Integer.parseInt(avroId), (String) zookeeperTemplate.readData(avroPath + "/" + avroId));
        }
    }

    public String getSchemaById(int id) {
        return schemaMap.get(id);
    }

    // just for test
    public static void main(String[] args) {
        SchemaMap map = new SchemaMap("11.11.184.183:2181", "/cnpc/schema/arvo");
        String str = map.getSchemaById(10000);
        System.out.println(str);
    }
}
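ZkClientTemplate above is an in-house utility that isn't shown in this article. For readers who want to reproduce it, here is a minimal sketch of what it could look like, assuming it wraps org.I0Itec.zkclient.ZkClient and stores znode data as UTF-8 strings; only the four methods called above are reconstructed, and the timeout values are arbitrary:

package com.matthew.util;

import org.I0Itec.zkclient.ZkClient;
import org.I0Itec.zkclient.serialize.ZkSerializer;

import java.nio.charset.StandardCharsets;
import java.util.List;

// Hypothetical reconstruction of the in-house helper; the real class may differ.
public class ZkClientTemplate {

    private String zkServers;
    private ZkClient zkClient;

    public void setZkServers(String zkServers) {
        this.zkServers = zkServers;
    }

    // Connect with a serializer that reads and writes znode data as UTF-8 strings.
    public void initWithZkSerializer() {
        zkClient = new ZkClient(zkServers, 30000, 30000, new ZkSerializer() {
            @Override
            public byte[] serialize(Object data) {
                return data.toString().getBytes(StandardCharsets.UTF_8);
            }

            @Override
            public Object deserialize(byte[] bytes) {
                return new String(bytes, StandardCharsets.UTF_8);
            }
        });
    }

    public List<String> getChildren(String path) {
        return zkClient.getChildren(path);
    }

    public Object readData(String path) {
        return zkClient.readData(path);
    }
}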
As the official documentation points out, a custom schema has to implement the DeserializationSchema interface. Hence:
package com.matthew.flink;

import com.alibaba.fastjson.JSONObject;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericDatumReader;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryDecoder;
import org.apache.avro.io.DecoderFactory;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.Iterator;

/**
 * Created by developer on 7/17/18.
 * Decodes records whose first 4 bytes are a little-endian schemaId and whose
 * remaining bytes are the Avro-encoded payload.
 */
public class CnpcByteArrayDeserializationScheme implements DeserializationSchema<JSONObject> {

    private static final Logger logger = LoggerFactory.getLogger(CnpcByteArrayDeserializationScheme.class);

    private SchemaMap schemaMap;

    public CnpcByteArrayDeserializationScheme(SchemaMap schemaMap) {
        this.schemaMap = schemaMap;
    }

    @Override
    public JSONObject deserialize(byte[] data) throws IOException {
        if (data.length > 4) {
            // The first 4 bytes carry the schemaId; use it to look up the Avro schema.
            int schemaId = littleEndianToInt(data, 0);
            logger.info("data schemaId: " + schemaId);
            String schemaStr = schemaMap.getSchemaById(schemaId);
            Schema schema = new Schema.Parser().parse(schemaStr);
            // Decode the Avro body that starts right after the 4-byte header.
            final GenericDatumReader<GenericRecord> reader = new GenericDatumReader<GenericRecord>(schema);
            final BinaryDecoder decoder = DecoderFactory.get().binaryDecoder(data, 4, data.length - 4, null);
            GenericRecord genericRecord = reader.read(null, decoder);
            // Copy every field into a JSONObject; Avro strings are Utf8 instances,
            // so convert them explicitly to java.lang.String.
            JSONObject record = new JSONObject();
            final Iterator<Schema.Field> iterator = schema.getFields().iterator();
            while (iterator.hasNext()) {
                final Schema.Field field = iterator.next();
                final String name = field.name();
                if (field.schema().getType() == Schema.Type.STRING) {
                    record.put(name, genericRecord.get(name).toString());
                } else {
                    record.put(name, genericRecord.get(name));
                }
            }
            return record;
        }
        // A record too short to carry the 4-byte header cannot be decoded.
        return null;
    }

    @Override
    public boolean isEndOfStream(JSONObject nextElement) {
        return false;
    }

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return TypeInformation.of(new TypeHint<JSONObject>() {
        });
    }

    // Reads 4 bytes starting at offset as a little-endian signed int.
    private static int littleEndianToInt(byte[] src, int offset) {
        return (src[offset] & 0xFF)
                | (src[offset + 1] & 0xFF) << 8
                | (src[offset + 2] & 0xFF) << 16
                | (src[offset + 3] & 0xFF) << 24;
    }
}
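To make the wire format concrete, here is a sketch of the matching producer side (illustrative code, not from the project): a 4-byte little-endian schemaId header followed by the Avro binary body. The NetLog schema is a made-up example standing in for the real schemas kept in ZooKeeper.

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericDatumWriter;
import org.apache.avro.generic.GenericRecord;
import org.apache.avro.io.BinaryEncoder;
import org.apache.avro.io.EncoderFactory;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;

// Hypothetical example showing how a record in this wire format can be built.
public class CnpcWireFormatExample {

    public static byte[] encode(int schemaId, Schema schema, GenericRecord record) throws IOException {
        ByteArrayOutputStream out = new ByteArrayOutputStream();
        // 4-byte little-endian schemaId header, matching littleEndianToInt above.
        out.write(ByteBuffer.allocate(4).order(ByteOrder.LITTLE_ENDIAN).putInt(schemaId).array());
        // Avro binary body follows the header.
        BinaryEncoder encoder = EncoderFactory.get().binaryEncoder(out, null);
        new GenericDatumWriter<GenericRecord>(schema).write(record, encoder);
        encoder.flush();
        return out.toByteArray();
    }

    public static void main(String[] args) throws IOException {
        // A toy schema of the kind that might be stored under /cnpc/schema/arvo/<id>.
        Schema schema = new Schema.Parser().parse(
                "{\"type\":\"record\",\"name\":\"NetLog\",\"fields\":["
              + "{\"name\":\"srcIp\",\"type\":\"string\"},"
              + "{\"name\":\"bytes\",\"type\":\"long\"}]}");
        GenericRecord record = new GenericData.Record(schema);
        record.put("srcIp", "10.0.0.1");
        record.put("bytes", 1024L);
        byte[] message = encode(10000, schema, record);
        System.out.println("encoded " + message.length + " bytes");
    }
}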
Change the kafkaConsumer reference in article 1 accordingly:
// FlinkKafkaConsumer010<String> kafkaConsumer = new FlinkKafkaConsumer010<>(topics, new SimpleStringSchema(), properties);
FlinkKafkaConsumer010<JSONObject> kafkaConsumer = new FlinkKafkaConsumer010<>(topics, new CnpcByteArrayDeserializationScheme(schemaMap), properties);
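Put together, the job wiring looks roughly like this (a sketch: the broker address, group id, topic name, and class name are placeholders; the real entry point is the Kafka2Es class from article 1):

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;

import java.util.Properties;

public class KafkaJsonJob {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "11.11.184.183:9092"); // placeholder broker
        properties.setProperty("group.id", "flink-cnpc-consumer");         // placeholder group id

        // Load all Avro schemas from ZooKeeper once; the map ships with the job.
        SchemaMap schemaMap = new SchemaMap("11.11.184.183:2181", "/cnpc/schema/arvo");

        FlinkKafkaConsumer010<JSONObject> kafkaConsumer = new FlinkKafkaConsumer010<>(
                "cnpc-log", new CnpcByteArrayDeserializationScheme(schemaMap), properties);

        env.addSource(kafkaConsumer).print();
        env.execute("kafka-to-json");
    }
}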
Running it directly in IDEA now prints the logs as well-formed JSON.
(Note: this schema class is an implementation specific to our own business data.)
Task 2: package the program and run it on the local Flink.
A plain mvn clean package produces a jar of about 15 KB; running it on the local Flink, the web UI complains that the dependent libraries cannot be found. So I switched to the assembly approach and bundled the dependencies straight into the jar. (Note: this approach needs improvement later, since the resulting jar comes close to 90 MB; the dependency jars should instead be read from HDFS, along the lines of how Spark runs on YARN.)
POM and assembly file
Add the following to the POM:
<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>2.5</version>
            <configuration>
                <descriptors>
                    <descriptor>assembly/src.xml</descriptor>
                </descriptors>
                <archive>
                    <manifest>
                        <mainClass>com.matthew.flink.Kafka2Es</mainClass>
                    </manifest>
                </archive>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>
The content of assembly/src.xml is as follows:
<assembly xmlns="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0"
          xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
          xsi:schemaLocation="http://maven.apache.org/plugins/maven-assembly-plugin/assembly/1.1.0 http://maven.apache.org/xsd/assembly-1.1.0.xsd">
    <id>jar-with-dependencies</id>
    <formats>
        <format>jar</format>
    </formats>
    <includeBaseDirectory>false</includeBaseDirectory>
    <dependencySets>
        <dependencySet>
            <unpack>false</unpack>
            <scope>runtime</scope>
        </dependencySet>
    </dependencySets>
    <fileSets>
        <fileSet>
            <directory>${project.build.outputDirectory}</directory>
        </fileSet>
    </fileSets>
</assembly>
After mvn clean package, upload flinkProcess-1.0-SNAPSHOT-jar-with-dependencies.jar through the Flink web UI at localhost:8081 and run it. The program runs normally.
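The same fat jar should also be submittable from the command line with bin/flink run -c com.matthew.flink.Kafka2Es flinkProcess-1.0-SNAPSHOT-jar-with-dependencies.jar; the -c flag can even be dropped here, since the manifest already names the main class.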