Flink实践场景-通过DataStream API统计每小时的出租车司机的收入

一、场景说明

        假设存在如下一个场景,出租车每完成一单生意会产生一笔账单,有司机想了解自己上一个小时的总收入。这是一个抽象化的实时时间窗口统计任务,具有一定的典型性,下面我们以此为例,看一下如何通过Flink DataStream API实现此逻辑。

二、场景模拟说明

        实际项目中网络出租车的订单会在接单时在数据库中插入一条数据,在完成订单时更新订单状态,包括本单收入。再通过CDC技术把订单数据实时发送到消息队列(如kafka)中,flink再消费消息队列数据,然后按1个小时的时间窗口统计这个小时司机的收入总额。

        本地练习时搭建这个流程过于麻烦,为此我们简化流程,直接构建Source端产生数据,并固化产生数据的时间间隔和每单的收入金额:

        每秒钟产生一条订单记录,每条记录包含ID,timestamp,tips,其中ID为司机的ID,唯一键;timestamp为记录产生的时间(即事件时间),此处模拟为每条记录的事件时间比上一条晚6分钟,即按事件时间一个小时产生10条,ID范围为【1,5】,则每个小时每个ID产生两条;tips为收入,固定为10.00元,每个司机一个小时产生2条,即20.00元;

/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.trublingwindow.source;

import org.apache.flink.annotation.Public;
import org.apache.flink.streaming.api.functions.source.FromIteratorFunction;

import java.io.Serializable;
import java.util.Iterator;

/**
 * A source that emits an endless stream of {@link TaxiFare} records, throttled to one record per
 * second of wall-clock time.
 *
 * <p>The records themselves come from {@link TaxiFareIterator#unbounded()}; this class only adds
 * the rate limiting.
 *
 * @deprecated This class is based on the {@link
 *     org.apache.flink.streaming.api.functions.source.SourceFunction} API, which is due to be
 *     removed. Use the new {@link org.apache.flink.api.connector.source.Source} API instead.
 */
@Deprecated
@Public
public class TaxiFareSource extends FromIteratorFunction<TaxiFare> {

    private static final long serialVersionUID = 1L;

    /** Creates a source backed by an unbounded, rate-limited {@link TaxiFareIterator}. */
    public TaxiFareSource() {
        super(new RateLimitedIterator<>(TaxiFareIterator.unbounded()));
    }

    /**
     * Iterator decorator that pauses before handing out each element of the wrapped iterator.
     *
     * @param <T> element type of the wrapped iterator
     */
    private static class RateLimitedIterator<T> implements Iterator<T>, Serializable {

        private static final long serialVersionUID = 1L;

        /** Wall-clock pause before each element is returned, in milliseconds. */
        private static final long EMIT_INTERVAL_MS = 1000L;

        private final Iterator<T> inner;

        private RateLimitedIterator(Iterator<T> inner) {
            this.inner = inner;
        }

        @Override
        public boolean hasNext() {
            return inner.hasNext();
        }

        /**
         * Sleeps for {@link #EMIT_INTERVAL_MS} and then returns the next element of the wrapped
         * iterator.
         *
         * @throws RuntimeException wrapping the {@link InterruptedException} if the thread is
         *     interrupted while sleeping; the interrupt flag is restored before throwing
         */
        @Override
        public T next() {
            try {
                Thread.sleep(EMIT_INTERVAL_MS);
            } catch (InterruptedException e) {
                // Restore the flag so code further up the stack can still observe the interrupt.
                Thread.currentThread().interrupt();
                throw new RuntimeException(e);
            }
            return inner.next();
        }
    }
}


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.trublingwindow.source;

import java.io.Serializable;
import java.sql.Timestamp;
import java.util.Arrays;
import java.util.Iterator;
import java.util.List;

/**
 * An iterator of {@link TaxiFare} events.
 *
 * <p>Cycles through a fixed list of fare templates (one per driver id 1..5). Every returned fare
 * is a fresh object stamped with a synthetic event time that starts at {@link #INITIAL_TIMESTAMP}
 * and advances by {@link #SIX_MINUTES} per element, independent of wall-clock time.
 *
 * <p>A bounded iterator ends after one pass over the templates; an unbounded one wraps around
 * indefinitely.
 */
final class TaxiFareIterator implements Iterator<TaxiFare>, Serializable {

    private static final long serialVersionUID = 1L;

    /** Event time of the first emitted fare; the string is parsed in the JVM's default time zone. */
    private static final Timestamp INITIAL_TIMESTAMP = Timestamp.valueOf("2019-01-01 00:00:00");

    /** Event-time distance between two consecutive fares, in milliseconds. */
    private static final long SIX_MINUTES = 6 * 60 * 1000;

    /**
     * Fare templates. They are only read, never handed out or modified; their timestamp field is a
     * placeholder that {@link #next()} replaces on the copy it returns.
     */
    private static final List<TaxiFare> DATA =
            Arrays.asList(
                    new TaxiFare(1, 0L, 10.00F),
                    new TaxiFare(2, 0L, 10.00F),
                    new TaxiFare(3, 0L, 10.00F),
                    new TaxiFare(4, 0L, 10.00F),
                    new TaxiFare(5, 0L, 10.00F));

    private final boolean bounded;

    /** Position of the next template to use within {@link #DATA}. */
    private int index = 0;

    /** Event time assigned to the next emitted fare, in epoch milliseconds. */
    private long timestamp;

    /** Creates an iterator that ends after a single pass over the templates. */
    static TaxiFareIterator bounded() {
        return new TaxiFareIterator(true);
    }

    /** Creates an iterator that restarts from the first template whenever it reaches the end. */
    static TaxiFareIterator unbounded() {
        return new TaxiFareIterator(false);
    }

    private TaxiFareIterator(boolean bounded) {
        this.bounded = bounded;
        this.timestamp = INITIAL_TIMESTAMP.getTime();
    }

    /**
     * Returns {@code true} if another fare is available: always for an unbounded iterator, and
     * until the templates are exhausted for a bounded one.
     */
    @Override
    public boolean hasNext() {
        return !bounded || index < DATA.size();
    }

    /**
     * Returns a new fare built from the next template and the current synthetic timestamp, then
     * advances the timestamp by six minutes.
     *
     * @throws java.util.NoSuchElementException if the iterator is bounded and exhausted
     */
    @Override
    public TaxiFare next() {
        if (index >= DATA.size()) {
            if (bounded) {
                throw new java.util.NoSuchElementException("No more taxi fares");
            }
            // Wrap around here so callers need not call hasNext() first.
            index = 0;
        }
        TaxiFare template = DATA.get(index++);
        // Hand out a copy: the templates are shared by every iterator in the JVM, and an element
        // returned earlier must not change when a later one is produced.
        TaxiFare fare = new TaxiFare(template.getDriverId(), timestamp, template.getTips());
        timestamp += SIX_MINUTES;
        return fare;
    }
}


/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package com.example.trublingwindow.source;


/**
 * A single taxi fare record: which driver earned how much, and when.
 *
 * <p>The class keeps a public no-argument constructor plus a getter and setter for every field.
 * Instances are mutable; {@link #equals(Object)} and {@link #hashCode()} are based on the current
 * field values, so do not modify an instance while it is used as a key in a hash-based collection.
 */
@SuppressWarnings("unused")
public final class TaxiFare {

    /** Identifier of the driver who earned this fare. */
    private long driverId;

    /** Time of the fare as a {@code long}; the data generator fills it with epoch milliseconds. */
    private long timestamp;

    /** Amount earned for this fare. */
    private float tips;

    /** Creates a fare with all fields at their zero defaults. */
    public TaxiFare() {}

    /**
     * Creates a fully populated fare.
     *
     * @param driverId identifier of the driver
     * @param timestamp time of the fare
     * @param tips amount earned
     */
    public TaxiFare(long driverId, long timestamp, float tips) {
        this.driverId = driverId;
        this.timestamp = timestamp;
        this.tips = tips;
    }

    public long getDriverId() {
        return driverId;
    }

    public void setDriverId(long driverId) {
        this.driverId = driverId;
    }

    public long getTimestamp() {
        return timestamp;
    }

    public void setTimestamp(long timestamp) {
        this.timestamp = timestamp;
    }

    public float getTips() {
        return tips;
    }

    public void setTips(float tips) {
        this.tips = tips;
    }

    /** Two fares are equal when driver id, timestamp and tips all match. */
    @Override
    public boolean equals(Object o) {
        if (this == o) {
            return true;
        }
        if (!(o instanceof TaxiFare)) {
            return false;
        }
        TaxiFare other = (TaxiFare) o;
        return driverId == other.driverId
                && timestamp == other.timestamp
                && Float.compare(tips, other.tips) == 0;
    }

    @Override
    public int hashCode() {
        int result = Long.hashCode(driverId);
        result = 31 * result + Long.hashCode(timestamp);
        result = 31 * result + Float.hashCode(tips);
        return result;
    }

    /** Returns a readable form such as {@code TaxiFare{driverId=1, timestamp=0, tips=10.0}}. */
    @Override
    public String toString() {
        return "TaxiFare{driverId=" + driverId + ", timestamp=" + timestamp + ", tips=" + tips + "}";
    }
}

        数据源构造完成了,那么计算逻辑如何实现的,核心逻辑如下:

// Sum each driver's income over tumbling one-hour event-time windows.
DataStream<Tuple3<Long, Long, Float>> hourlyTips = fares
        // driverId is a private field of TaxiFare, so the key must be read through the getter.
        .keyBy((TaxiFare fare) -> fare.getDriverId())
        .window(TumblingEventTimeWindows.of(Duration.ofHours(1)))
        .process(new AddTips());

即按照ID进行流分区,然后通过滚动窗口,窗口大小为1个小时;然后通过process计算一个窗口内收入的总和,最后再输出,此用例为打印结果;

三、核心代码逻辑

package com.example.trublingwindow;

import com.example.trublingwindow.source.TaxiFare;
import com.example.trublingwindow.source.TaxiFareSource;
import org.apache.flink.api.common.RuntimeExecutionMode;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.time.Duration;

/**
 * Demo job that sums each driver's tips per tumbling one-hour event-time window and prints one
 * {@code (driverId, windowEnd, totalTips)} tuple per driver and window.
 */
public class TumblingEventTimeWindowTest {

    /** How far the watermark trails the largest event timestamp seen so far. */
    private static final Duration MAX_OUT_OF_ORDERNESS = Duration.ofMinutes(10);

    /** Size of the tumbling event-time window. */
    private static final Duration WINDOW_SIZE = Duration.ofHours(1);

    public static void main(String[] args) throws Exception {

        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setRuntimeMode(RuntimeExecutionMode.STREAMING);

        DataStreamSource<TaxiFare> source = env.addSource(new TaxiFareSource());

        DataStream<Tuple3<Long, Long, Float>> hourlyTips = source
                // Event time comes from the record itself, not from when the record arrives.
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy.<TaxiFare>forBoundedOutOfOrderness(MAX_OUT_OF_ORDERNESS)
                                .withTimestampAssigner((taxiFare, recordTimestamp) -> taxiFare.getTimestamp()))
                .keyBy(taxi -> taxi.getDriverId())
                .window(TumblingEventTimeWindows.of(WINDOW_SIZE))
                .process(new AddTips());

        hourlyTips.print();
        env.execute();
    }

    /**
     * Window function that adds up the tips of all fares of one driver in one window and emits a
     * single {@code (driverId, window end timestamp, total tips)} tuple.
     */
    private static class AddTips
            extends ProcessWindowFunction<TaxiFare, Tuple3<Long, Long, Float>, Long, TimeWindow> {

        private static final long serialVersionUID = 1L;

        /**
         * @param driverId key of the window, i.e. the driver id
         * @param context gives access to the window whose end timestamp is emitted
         * @param elements all fares of this driver assigned to the window
         * @param out receives exactly one result tuple per invocation
         */
        @Override
        public void process(
                Long driverId,
                Context context,
                Iterable<TaxiFare> elements,
                Collector<Tuple3<Long, Long, Float>> out) {
            // Primitive accumulator: avoids allocating a new boxed Float for every element.
            float totalTips = 0F;
            for (TaxiFare fare : elements) {
                totalTips += fare.getTips();
            }
            out.collect(Tuple3.of(driverId, context.window().getEnd(), totalTips));
        }
    }
}

        代码中补充了如下问题与解决方法:

1)Flink按照窗口计算需要有watermark推动,此场景下为事件时间,采用订单记录中的timestamp作为事件时间;

withTimestampAssigner((taxiFare, recordTimestamp) -> taxiFare.getTimestamp())

2)迟到的数据如何处理呢?比如10:05了来了一条9:58的数据,此时需要等待延迟数据到来,但是不能无限时间等待,那将永远无法计算,因为永远不知道下一条数据是不是迟到数据。故此用例中允许数据最多乱序10分钟:watermark比已收到的最大事件时间滞后10分钟,当watermark越过窗口结束时间时窗口触发计算;此后才到达的属于该窗口的数据(即迟到超过10分钟)会被丢弃掉

WatermarkStrategy.<TaxiFare>forBoundedOutOfOrderness(Duration.ofMinutes(10))

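结果打印(">"前的数字为输出子任务编号,括号内依次为司机ID、窗口时间戳、该小时收入总和。注意:下面的窗口时间戳以…599999结尾,等于窗口结束时间减1毫秒,即window().maxTimestamp()的值;按上文代码使用window().getEnd()时,输出的是不含的窗口结束时间,如1546275600000)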

1> (4,1546275599999,20.0)
11> (1,1546275599999,20.0)
16> (2,1546275599999,20.0)
15> (3,1546275599999,20.0)
16> (5,1546275599999,20.0)
1> (4,1546279199999,20.0)
15> (3,1546279199999,20.0)
16> (2,1546279199999,20.0)
11> (1,1546279199999,20.0)
16> (5,1546279199999,20.0)
1> (4,1546282799999,20.0)
16> (2,1546282799999,20.0)
15> (3,1546282799999,20.0)
16> (5,1546282799999,20.0)
11> (1,1546282799999,20.0)

四、Flink maven依赖

<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <!-- Single place to change the version shared by all Flink artifacts below. -->
    <flink.version>1.19.0</flink.version>
</properties>

<dependencies>
    <!-- Flink artifacts; keep them all on the same version via ${flink.version}. -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-core</artifactId>
        <version>${flink.version}</version>
    </dependency>

    <!-- NOTE(review): none of the Java code shown in this article references Hadoop classes;
         confirm whether this dependency is still required. -->
    <dependency>
        <groupId>org.apache.hadoop</groupId>
        <artifactId>hadoop-common</artifactId>
        <version>2.10.2</version>
    </dependency>
</dependencies>

  • 7
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值