Earlier, the data was successfully read out of Kafka as JSON; this document records how that data is written into ES according to the requirements.
Requirement: the index names are YYYYMMDD-http and YYYYMMDD-dns, where the target index is determined by the recv_time and log_type fields of each record. For example, a record with log_type "http" and a recv_time falling on 2018-07-16 would go into the index 20180716-http.
Approach: Flink is similar to Flume in that data processing is framed by a Source for input and a Sink for output, so the ES logic only needs to be implemented in a custom ES sink.
Reference: https://ci.apache.org/projects/flink/flink-docs-release-1.5/dev/connectors/elasticsearch.html
Straight to the code.
Add the ES sink output in the main program:
package com.matthew.flink;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.shaded.jackson2.org.yaml.snakeyaml.Yaml;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.InputStream;
import java.util.*;

/**
 * Created by developer on 7/16/18.
 */
public class Kafka2Es {

    private static final Logger logger = LoggerFactory.getLogger(Kafka2Es.class);

    public static void main(String[] args) {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Load the job settings from config.yml on the classpath.
        InputStream inputStream = Thread.currentThread().getContextClassLoader().getResourceAsStream("config.yml");
        Map<String, String> config = new Yaml().loadAs(inputStream, Map.class);
        IOUtils.closeStream(inputStream);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", config.get("bootstrapServer"));
        properties.setProperty("zookeeper.connect", config.get("zookeeperConnect"));
        properties.setProperty("group.id", config.get("group.id"));

        List<String> topics = Arrays.asList(config.get("topics").split(","));
        // List<String> topics = Arrays.asList(args);

        SchemaMap schemaMap = new SchemaMap(config.get("zookeeperConnect"), config.get("avroPath"));
        FlinkKafkaConsumer010<JSONObject> kafkaConsumer =
                new FlinkKafkaConsumer010<>(topics, new CnpcByteArrayDeserializationScheme(schemaMap), properties);
        ElasticsearchSink<JSONObject> elasticsearchSink =
                FlinkElastic.getElasticsearchSink(config.get("esTransports"), config.get("esClusterName"));

        DataStream<JSONObject> stream = env.addSource(kafkaConsumer).name("KafkaConsumer");
        stream.addSink(elasticsearchSink).name("Es");

        try {
            env.execute("Kafka2Es-" + String.join(";", topics));
        } catch (Exception e) {
            logger.error("Job execution failed", e);
        }
    }
}
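For reference, the main program reads its settings from a config.yml on the classpath. A minimal sketch of that file is shown below; the keys are exactly the ones the code looks up, but every value is a placeholder rather than a real address from this environment:

bootstrapServer: "kafka1:9092,kafka2:9092"
zookeeperConnect: "zk1:2181,zk2:2181"
group.id: "flink-kafka2es"
topics: "http,dns"
avroPath: "/path/to/avro/schemas"
esTransports: "es1:9300,es2:9300"
esClusterName: "my-es-cluster"

Note that topics and esTransports are comma-separated strings, since the code parses them with split(",").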
The custom ES sink implementation class:
package com.matthew.flink;

import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.functions.RuntimeContext;
import org.apache.flink.streaming.connectors.elasticsearch.ElasticsearchSinkFunction;
import org.apache.flink.streaming.connectors.elasticsearch.RequestIndexer;
import org.apache.flink.streaming.connectors.elasticsearch5.ElasticsearchSink;
import org.apache.flink.streaming.connectors.elasticsearch5.shaded.org.joda.time.DateTime;
import org.elasticsearch.action.index.IndexRequest;
import org.elasticsearch.client.Requests;

import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Created by developer on 7/17/18.
 */
public class FlinkElastic {

    public static ElasticsearchSink<JSONObject> getElasticsearchSink(String esTransPorts, String clusterName) {
        ElasticsearchSink<JSONObject> esSink = null;
        Map<String, String> config = new HashMap<String, String>();
        // Flush the bulk buffer after 3000 actions or after 1 ms, whichever comes first;
        // the very short interval effectively makes the sink emit almost immediately.
        config.put(ElasticsearchSink.CONFIG_KEY_BULK_FLUSH_MAX_ACTIONS, "3000");
        config.put(ElasticsearchSink.CONFIG_KEY_BULK_FLUSH_INTERVAL_MS, "1");
        // Without the cluster name the sink cannot discover the ES nodes (see "Problems encountered" below).
        config.put("cluster.name", clusterName);
        try {
            List<InetSocketAddress> transports = new ArrayList<InetSocketAddress>();
            // port is 9300 for the elastic TransportClient
            // transports.add(new InetSocketAddress("11.11.184.182", 9300));
            for (String s : esTransPorts.split(",")) {
                String[] transPort = s.split(":");
                transports.add(new InetSocketAddress(transPort[0], Integer.parseInt(transPort[1])));
            }
            ElasticsearchSinkFunction<JSONObject> indexLog = new ElasticsearchSinkFunction<JSONObject>() {
                // Derive the index name from the recv_time and log_type fields, e.g. 20180716-http.
                public IndexRequest createIndexRequest(JSONObject element) {
                    String log_type = element.getString("log_type");
                    final DateTime dateTime = new DateTime(element.getLongValue("recv_time"));
                    String indexPrefix = dateTime.toString("yyyyMMdd");
                    return Requests.indexRequest().index(indexPrefix + "-" + log_type).type(log_type).source(element);
                }

                @Override
                public void process(JSONObject s, RuntimeContext runtimeContext, RequestIndexer requestIndexer) {
                    requestIndexer.add(createIndexRequest(s));
                }
            };
            esSink = new ElasticsearchSink<JSONObject>(config, transports, indexLog);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return esSink;
    }
}
Run the job; the written data can then be inspected with the es-head plugin.
Problems encountered:
1. The sample configuration in the official documentation does not set the ES cluster name, which left the program unable to discover the ES nodes; the cluster.name setting has to be added on top of the official example.
2. Before writing any data, remember to create the mappings for the target indices in ES beforehand, or add a check-and-create step to the program (a sketch follows below).
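For the second issue, the sketch below shows one possible check-and-create step using the ES 5.x TransportClient. It is an illustration only: the class IndexBootstrap, the ensureIndex helper, and the hard-coded mapping are all hypothetical and not part of the original job, and the mapping JSON must be replaced with the real field definitions of the http/dns records.

package com.matthew.flink;

import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.elasticsearch.common.xcontent.XContentType;
import org.elasticsearch.transport.client.PreBuiltTransportClient;

import java.net.InetAddress;

/**
 * Hypothetical helper: make sure a daily index exists with a mapping before the job writes to it.
 */
public class IndexBootstrap {

    // Minimal mapping for illustration only; replace with the real field definitions.
    private static final String SAMPLE_MAPPING = "{\"properties\":{"
            + "\"recv_time\":{\"type\":\"date\"},"
            + "\"log_type\":{\"type\":\"keyword\"}}}";

    public static void ensureIndex(String clusterName, String host, int port,
                                   String index, String type) throws Exception {
        Settings settings = Settings.builder().put("cluster.name", clusterName).build();
        try (TransportClient client = new PreBuiltTransportClient(settings)
                .addTransportAddress(new InetSocketTransportAddress(InetAddress.getByName(host), port))) {
            boolean exists = client.admin().indices().prepareExists(index).get().isExists();
            if (!exists) {
                // Create the index together with the type mapping before any document is written.
                client.admin().indices().prepareCreate(index)
                        .addMapping(type, SAMPLE_MAPPING, XContentType.JSON)
                        .get();
            }
        }
    }

    public static void main(String[] args) throws Exception {
        // e.g. make sure today's http index exists before starting the Flink job
        ensureIndex("my-es-cluster", "11.11.184.182", 9300, "20180716-http", "http");
    }
}

A common alternative is to register an index template in ES, so that each day's new YYYYMMDD-http / YYYYMMDD-dns index picks up the mapping automatically without any per-index setup.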