Getting Started with Flink (Part 5): Joining a Real-Time Stream with an Elasticsearch 6 Dimension Table

Requirement

The real-time stream needs to be joined with a dimension table to enrich it with extra attributes.
Spark Streaming can join a stream against a Hive table.
Flink has no equivalent built-in feature, so the dimension table is kept in Elasticsearch instead.


Maven dependencies

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.6.2</flink.version>
        <fastjson.version>1.2.47</fastjson.version>
        <elasticsearch.version>6.3.0</elasticsearch.version>
        <guava.version>25.1-jre</guava.version>
    </properties>
     ...

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>${fastjson.version}</version>
        </dependency>
        
        <dependency>
            <groupId>org.elasticsearch.client</groupId>
            <artifactId>elasticsearch-rest-high-level-client</artifactId>
            <version>${elasticsearch.version}</version>
        </dependency>

        <dependency>
            <groupId>com.google.guava</groupId>
            <artifactId>guava</artifactId>
            <version>${guava.version}</version>
        </dependency>
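
The elided section above should also include the Flink artifacts the code below depends on. A sketch of what those entries would look like for Flink 1.6.2, assuming the Scala 2.11 builds (the exact Scala suffix depends on your setup):

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
            <version>${flink.version}</version>
        </dependency>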
        

Note: the ES table is looked up by key, and Guava is used as a local cache to reduce repeated fetches from ES.
Create a class AsyncEsDataRequest that extends RichAsyncFunction.


package com.tc.flink.demo.es;

import com.alibaba.fastjson.JSONObject;
import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;
import com.google.common.cache.RemovalListener;
import com.google.common.cache.RemovalNotification;
import com.tc.flink.util.CommonUtil;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import org.elasticsearch.action.ActionListener;
import org.elasticsearch.action.search.SearchRequest;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.client.RestHighLevelClient;
import org.elasticsearch.index.query.QueryBuilder;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.builder.SearchSourceBuilder;

import java.util.Collections;
import java.util.concurrent.TimeUnit;

public class AsyncEsDataRequest extends RichAsyncFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>> {

    private transient RestHighLevelClient restHighLevelClient;

    private transient volatile Cache<Tuple2<String, String>, String> cityPercent;

    @Override
    public void open(Configuration parameters) throws Exception {
        //Initialize the Elasticsearch client
        restHighLevelClient = CommonUtil.getRestHighLevelClient();
        //Cache configuration: small size and short TTL for this demo
        cityPercent = CacheBuilder.newBuilder()
                .maximumSize(10)
                .expireAfterWrite(5, TimeUnit.MINUTES)
                .removalListener(
                        //Remove in production; kept here to observe cache evictions during testing
                        new RemovalListener<Object, Object>() {
                            @Override
                            public void onRemoval(RemovalNotification<Object, Object> notification) {
                                System.out.println(notification.getKey() + " was removed, cause: " + notification.getCause());
                            }
                        }
                ).build();
    }

    @Override
    public void close() throws Exception {
        restHighLevelClient.close();
    }


    @Override
    public void asyncInvoke(Tuple2<Tuple2<String, String>, Integer> input, ResultFuture<Tuple2<Tuple2<String, String>, String>> resultFuture) throws Exception {
        Tuple2<String, String> fromToCity = input.f0;
        //If the key is already cached, answer straight from the cache
        String stationPercent = cityPercent.getIfPresent(fromToCity);
        if (stationPercent != null) {
            System.out.println("get data from the cache: " + stationPercent);
            resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, stationPercent)));
        } else {
            search(input, resultFuture);
        }

    }
    //Asynchronously query the ES table
    private void search(Tuple2<Tuple2<String, String>, Integer> input, ResultFuture<Tuple2<Tuple2<String, String>, String>> resultFuture) {
        SearchRequest searchRequest = new SearchRequest("trafficwisdom.train_section_percent");
        String fromCity = input.f0.f0;
        String toCity = input.f0.f1;
        QueryBuilder builder = QueryBuilders.boolQuery().must(QueryBuilders.termQuery("from_city", fromCity)).must(QueryBuilders.termQuery("to_city", toCity));
        SearchSourceBuilder sourceBuilder = new SearchSourceBuilder();
        sourceBuilder.query(builder);
        searchRequest.source(sourceBuilder);
        ActionListener<SearchResponse> listener = new ActionListener<SearchResponse>() {
            //On success
            @Override
            public void onResponse(SearchResponse searchResponse) {
                String stationPercent = null;
                SearchHit[] searchHits = searchResponse.getHits().getHits();
                if (searchHits.length > 0) {
                    JSONObject jsonObject = JSONObject.parseObject(searchHits[0].getSourceAsString());
                    stationPercent = jsonObject.getString("section_search_percent");
                    cityPercent.put(input.f0, stationPercent);
                }
                System.out.println("get data from the es :" + stationPercent);
                resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, stationPercent)));
            }

            //On failure: emit the input key with a null dimension value so the stream keeps flowing
            @Override
            public void onFailure(Exception e) {
                resultFuture.complete(Collections.singleton(new Tuple2<Tuple2<String, String>, String>(input.f0, null)));
            }
        };
        restHighLevelClient.searchAsync(searchRequest, listener);
    }
}
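
CommonUtil.getRestHighLevelClient() is referenced above but not shown in the post. A minimal sketch of what it might look like, assuming a single ES node at localhost:9200 (the host, port, and class layout are assumptions, not the original implementation):

package com.tc.flink.util;

import org.apache.http.HttpHost;
import org.elasticsearch.client.RestClient;
import org.elasticsearch.client.RestHighLevelClient;

public class CommonUtil {

    //Builds a RestHighLevelClient over the low-level REST client.
    //Replace the placeholder address with your cluster's nodes.
    public static RestHighLevelClient getRestHighLevelClient() {
        return new RestHighLevelClient(
                RestClient.builder(new HttpHost("localhost", 9200, "http")));
    }
}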

Main program

package com.tc.flink.demo.es;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import com.tc.flink.conf.KafkaConfig;
import org.apache.flink.api.common.functions.JoinFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.AsyncDataStream;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;

import java.util.Properties;
import java.util.concurrent.TimeUnit;


public class StreamJoinStaticData {

    public static void main(String[] args) throws Exception {

        //Local environment for this demo; use StreamExecutionEnvironment.getExecutionEnvironment() when submitting to a cluster
        StreamExecutionEnvironment envStream = StreamExecutionEnvironment.createLocalEnvironment();
        //IngestionTime assigns timestamps at the source, so the event-time windows below work off ingestion timestamps
        envStream.setStreamTimeCharacteristic(TimeCharacteristic.IngestionTime);
        Properties propsConsumer = new Properties();
        propsConsumer.setProperty("bootstrap.servers", KafkaConfig.KAFKA_BROKER_LIST);
        propsConsumer.setProperty("group.id", "test");
        FlinkKafkaConsumer011<String> consumer = new FlinkKafkaConsumer011<String>("topic-test", new SimpleStringSchema(), propsConsumer);
        consumer.setStartFromLatest();
        DataStream<String> stream = envStream.addSource(consumer).setParallelism(2);
        stream.print();
        DataStream<Tuple2<Tuple2<String, String>, Integer>> tuple2Stream = stream.map(s -> {
            JSONObject jsonObject = JSON.parseObject(s);
            String fromCity = jsonObject.getString("fromCity");
            String toCity = jsonObject.getString("toCity");
            Integer ticketNum = jsonObject.getInteger("ticketNum");
            return Tuple2.of(Tuple2.of(fromCity, toCity), ticketNum);
        }).returns(Types.TUPLE(Types.TUPLE(Types.STRING, Types.STRING), Types.INT));

        //Set the timeout generously, otherwise lookups time out and the job fails; this step effectively builds a stream of ES lookup results
        DataStream<Tuple2<Tuple2<String, String>, String>> dimTable = AsyncDataStream.unorderedWait(tuple2Stream, new AsyncEsDataRequest(), 2, TimeUnit.SECONDS, 100);
        //Join the real-time stream with the ES lookup stream
        DataStream<Tuple3<Tuple2<String, String>, String, Integer>> finalResult = tuple2Stream.join(dimTable).where(new FirstKeySelector()).equalTo(new SecondKeySelector())
                .window(TumblingEventTimeWindows.of(Time.milliseconds(1000))).apply(
                        new JoinFunction<Tuple2<Tuple2<String, String>, Integer>, Tuple2<Tuple2<String, String>, String>, Tuple3<Tuple2<String, String>, String, Integer>>() {
                            @Override
                            public Tuple3<Tuple2<String, String>, String, Integer> join(Tuple2<Tuple2<String, String>, Integer> first, Tuple2<Tuple2<String, String>, String> second) throws Exception {
                                return Tuple3.of(first.f0, second.f1, first.f1);
                            }
                        }
                );

        finalResult.print();

        envStream.execute("this-test");
    }

    private static class FirstKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, Integer>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, Integer> value) throws Exception {
            return value.f0;
        }
    }

    private static class SecondKeySelector implements KeySelector<Tuple2<Tuple2<String, String>, String>, Tuple2<String, String>> {
        @Override
        public Tuple2<String, String> getKey(Tuple2<Tuple2<String, String>, String> value) throws Exception {
            return value.f0;
        }
    }

}

This completes the join between a real-time stream and an Elasticsearch dimension table,
with a Guava cache in front of the ES lookups to reduce repeated reads.
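
For reference, a sample run with made-up values: given a Kafka message like

{"fromCity":"Beijing","toCity":"Shanghai","ticketNum":3}

and an ES document in trafficwisdom.train_section_percent with from_city=Beijing, to_city=Shanghai, and section_search_percent=0.85, the job would emit

((Beijing,Shanghai),0.85,3)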
