Earlier, the data was successfully read out of Kafka as JSON; this document records how that data is written into ES according to the requirements.
Requirement: the index names are YYYYMMDD-http and YYYYMMDD-dns, where the target index is determined by the recv_time and log_type fields of each record. For example, a record with log_type "http" and a recv_time falling on 2018-07-16 would go into the index 20180716-http.
Approach: Flink is similar to Flume in that data processing is framed by a Source for input and a Sink for output, so the ES logic only needs to be implemented in a custom ES sink.
Reference: https://ci.apache.org/projects/flink/flink-docs-release-1.5/dev/connectors/elasticsearch.html
Straight to the code.
Add the ES sink output in the main program:
package com.matthew.flink;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.shaded.jackson2.org.yaml.snakeyaml.Yaml;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.util.*;

/**
 * Created by developer on 7/16/18.
 */
public class Kafka2Es {

    private static final Logger logger = LoggerFactory.getLogger(Kafka2Es.class);

    public static void main(String[] args) {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Load the job settings from config.yml on the classpath.
        InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("config.yml");
        Map<String, String> config = new Yaml().loadAs(inputStream, Map.class);
        IOUtils.closeStream(inputStream);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", config.get("bootstrapServer"));
        properties.setProperty("zookeeper.connect", config.get("zookeeperConnect"));
        properties.setProperty("group.id", config.get("group.id"));

        List<String> topics = Arrays.asList(config.get("topics").split(","));
        // List<String> topics = Arrays.asList(args);

        SchemaMap schemaMap = new SchemaMap(config.get("zookeeperConnect"), config.get("avroPath"));
        FlinkKafkaConsumer010<JSONObject> kafkaConsumer =
                new FlinkKafkaConsumer010<>(topics, new CnpcByteArrayDeserializationScheme(schemaMap), properties);
        ElasticsearchSink<JSONObject> elasticsearchSink =
                FlinkElastic.getElasticsearchSink(config.get("esTransports"), config.get("esClusterName"));

        DataStream<JSONObject> stream = env.addSource(kafkaConsumer).name("KafkaConsumer");
        stream.addSink(elasticsearchSink).name("Es");

        try {
            env.execute("Kafka2Es-" + String.join(";", topics));
        } catch (Exception e) {
            logger.error("Job execution failed", e);
        }
    }
}
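For reference, the main program reads its settings from a config.yml on the classpath. A minimal sketch of that file is shown below; the keys are exactly the ones the code looks up, but every value is a placeholder rather than a real address from this environment:

bootstrapServer: "kafka1:9092,kafka2:9092"
zookeeperConnect: "zk1:2181,zk2:2181"
group.id: "flink-kafka2es"
topics: "http,dns"
avroPath: "/path/to/avro/schemas"
esTransports: "es1:9300,es2:9300"
esClusterName: "my-es-cluster"

Note that topics and esTransports are comma-separated strings, since the code parses them with split(",").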
The custom ES sink implementation class:
package com.matthew.flink;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink;
import org.apache.flink.streaming.connectors.elasticsearch5.shaded.org.joda.time.DateTime;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;

import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by developer on 7/17/18.
 */
public class FlinkElastic {

    public static ElasticsearchSink<JSONObject> getElasticsearchSink(String esTransPorts, String clusterName) {
        ElasticsearchSink<JSONObject> esSink = null;
        Map<String, String> config = new HashMap<String, String>();
        // Flush the bulk buffer after 3000 actions or after 1 ms, whichever comes first;
        // the very short interval effectively makes the sink emit almost immediately.
        config.put(ElasticsearchSink.CONFIG_KEY_BULK_FLUSH_MAX_ACTIONS, "3000");
        config.put(ElasticsearchSink.CONFIG_KEY_BULK_FLUSH_INTERVAL_MS, "1");
        // Without the cluster name the sink cannot discover the ES nodes (see "Problems encountered" below).
        config.put("cluster.name", clusterName);
        try {
            List<InetSocketAddress> transports = new ArrayList<InetSocketAddress>();
            // port is 9300 for the elastic TransportClient
            // transports.add(new InetSocketAddress("11.11.184.182", 9300));
            for (String s : esTransPorts.split(",")) {
                String[] transPort = s.split(":");
                transports.add(new InetSocketAddress(transPort[0], Integer.parseInt(transPort[1])));
            }
            ElasticsearchSinkFunction<JSONObject> indexLog = new ElasticsearchSinkFunction<JSONObject>() {
                // Derive the index name from the recv_time and log_type fields, e.g. 20180716-http.
                public IndexRequest createIndexRequest(JSONObject element) {
                    String log_type = element.getString("log_type");
                    final DateTime dateTime = new DateTime(element.getLongValue("recv_time"));
                    String indexPrefix = dateTime.toString("yyyyMMdd");
                    return Requests.indexRequest().index(indexPrefix + "-" + log_type).type(log_type).source(element);
                }

                @Override
                public void process(JSONObject s, RuntimeContext runtimeContext, RequestIndexer requestIndexer) {
                    requestIndexer.add(createIndexRequest(s));
                }
            };
            esSink = new ElasticsearchSink<JSONObject>(config, transports, indexLog);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return esSink;
    }
}
Run the job; the written data can then be inspected with the es-head plugin.
Problems encountered:
1. The sample configuration in the official documentation does not set the ES cluster name, which left the program unable to discover the ES nodes; the cluster.name setting has to be added on top of the official example.
2. Before writing any data, remember to create the mappings for the target indices in ES beforehand, or add a check-and-create step to the program (a sketch follows below).
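For the second issue, the sketch below shows one possible check-and-create step using the ES 5.x TransportClient. It is an illustration only: the class IndexBootstrap, the ensureIndex helper, and the hard-coded mapping are all hypothetical and not part of the original job, and the mapping JSON must be replaced with the real field definitions of the http/dns records.

package com.matthew.flink;

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.net.InetAddress;

/**
 * Hypothetical helper: make sure a daily index exists with a mapping before the job writes to it.
 */
public class IndexBootstrap {

    // Minimal mapping for illustration only; replace with the real field definitions.
    private static final String SAMPLE_MAPPING = "{\"properties\":{"
            + "\"recv_time\":{\"type\":\"date\"},"
            + "\"log_type\":{\"type\":\"keyword\"}}}";

    public static void ensureIndex(String clusterName, String host, int port,
                                   String index, String type) throws Exception {
        Settings settings = Settings.builder().put("cluster.name", clusterName).build();
        try (TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port))) {
            boolean exists = client.admin().indices().prepareExists(index).get().isExists();
            if (!exists) {
                // Create the index together with the type mapping before any document is written.
                client.admin().indices().prepareCreate(index)
                        .addMapping(type, SAMPLE_MAPPING, XContentType.JSON)
                        .get();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // e.g. make sure today's http index exists before starting the Flink job
        ensureIndex("my-es-cluster", "11.11.184.182", 9300, "20180716-http", "http");
    }
}

A common alternative is to register an index template in ES, so that each day's new YYYYMMDD-http / YYYYMMDD-dns index picks up the mapping automatically without any per-index setup.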