A Streaming Module Built on Flink & Kafka

Original post: https://program-park.github.io/2021/06/01/flink_7/

That's right, this builds on the features from a few earlier posts:
Real-time deduplication in Flink with MapState
Receiving Kafka source data in Flink
Sending data over a UDP port in Java
I'm writing these posts while still in the middle of development (exhausting). This is the previously promised "Flink-to-Kafka deduplication with detail tuning" post: it adds the optimization part and fixes the Kafka data-loss problem. UDP will still drop a small amount of data, roughly under one in a thousand; if your business can't tolerate that, send over TCP instead, and that code is included below too.
Once you see the detailed comments you'll see just how thorough I am (hehe). Give me a follow and the exhaustion will have been worth it -。-

Enough chatter, here's the code:

package com.distinct

import java.util.Properties

import com.utils.SocketUDPClient
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{MapState, MapStateDescriptor, StateTtlConfig}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.api.common.time.Time
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

object Kafka2flink {

  def main(args: Array[String]): Unit = {

//    Create the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
//    Enable checkpointing (every 100 s)
    env.enableCheckpointing(100000)
//    Restart strategy: restart up to 2 times on failure, 2 s apart (the default is fixed-delay with unlimited restarts)
    env.getConfig.setRestartStrategy(RestartStrategies.fixedDelayRestart(2, Time.seconds(2)))
//    Minimum pause between checkpoints (at least 1000 ms between the end of one checkpoint and the start of the next)
    env.getCheckpointConfig.setMinPauseBetweenCheckpoints(1000)
//    Checkpoint timeout (a checkpoint must complete within 2 minutes or it is discarded)
    env.getCheckpointConfig.setCheckpointTimeout(120000)
//    Allow only one checkpoint at a time
    env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)
//    Configure the state backend
    val stateBackend = new FsStateBackend("file:///ssd/flink2kafka/flink-1.12.4/data")
    env.setStateBackend(stateBackend)
//    Keep checkpoint data after the job is cancelled, so the job can later be restored from a chosen checkpoint
    env.getCheckpointConfig.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
//    Checkpointing mode (exactly-once, to integrate with Kafka)
    env.getCheckpointConfig.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)

//    Kafka props
    val properties = new Properties()
//    Kafka broker address
    properties.setProperty("bootstrap.servers", "10.7.2.20:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

//    After a successful checkpoint, the consumer also commits offsets back to a special Kafka topic (I don't recommend changing this to false)
//    With false, offsets are no longer written to that topic
//    kafkaConsumer.setCommitOffsetsOnCheckpoints(false)

//    Create the Kafka DataStream; setStartFromGroupOffsets() resumes from the group's committed offsets
    val kafkaConsumer = new FlinkKafkaConsumer[String]("test", new SimpleStringSchema(), properties)
    kafkaConsumer.setStartFromGroupOffsets()
    val stream = env.addSource(kafkaConsumer)

//    val stream: DataStream[String] = env.socketTextStream("10.7.2.20", 9999)
//    Pair each record with a dummy value, key by the record itself, and deduplicate
    val streamdata = stream.map(x => {
      (x, "1")
    })
      .keyBy(_._1)
      .process(new Deduplicator())

//    streamdata
//      .print()

//    Send to the third party over a TCP socket
    streamdata.map(data => {data + "\n"}).writeToSocket("10.7.2.21", 6666, new SimpleStringSchema())
//    Custom sink: send to the third party over a UDP socket
//    streamdata.addSink(new UdpSink())

//    Launch the job
    env.execute("Socket stream word count")
  }

}
class UdpSink extends RichSinkFunction[String] {

  var socketUDPClient: SocketUDPClient = _

//  One UDP client per parallel sink instance, created when the sink opens
  override def open(parameters: Configuration): Unit = {
    socketUDPClient = new SocketUDPClient("10.7.2.21", 6666)
  }

  override def invoke(value: String, context: SinkFunction.Context): Unit = {
    socketUDPClient.send(value)
  }

  override def close(): Unit = {
    socketUDPClient.close()
  }
}

class Deduplicator extends KeyedProcessFunction[String, (String, String), String]() {

  var state: MapState[String, String] = _
  val dec = new MapStateDescriptor[String, String]("state", classOf[String], classOf[String])

  override def open(parameters: Configuration): Unit = {

    val ttlConfig = StateTtlConfig
      .newBuilder(Time.seconds(3)) // the state lives for 3 seconds
      .setUpdateType(StateTtlConfig.UpdateType.OnReadAndWrite) // both reads and writes refresh the TTL
      .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired) // never return expired state
//      .cleanupInRocksdbCompactFilter(5) // refresh the timestamp seen by the compaction filter after every 5 processed entries (RocksDB backend only)
      .build
    dec.enableTimeToLive(ttlConfig)

//    Register the state with the TTL-enabled descriptor; creating a second,
//    fresh descriptor here would silently drop the TTL configuration
    state = getRuntimeContext.getMapState(dec)
  }

  override def processElement(i: (String, String), context: KeyedProcessFunction[String, (String, String), String]#Context, collector: Collector[String]): Unit = {

    if (state.contains(i._1)) {
//      Already seen within the TTL window: refresh the entry, emit nothing
      state.put(i._1, "1")
    } else {
//      First occurrence of this key: record it and emit downstream
      state.put(i._1, "1")
      collector.collect(i._1)
    }
  }
}
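
By the way, if you want to sanity-check the dedup logic without a Kafka cluster, a bounded local stream does the job. This is just a throwaway sketch I'd use (the object name and test data are made up, not part of the module):

package com.distinct

import org.apache.flink.streaming.api.scala._

// Minimal local smoke test for Deduplicator: feed a bounded stream containing
// duplicate keys and check that each key is emitted only once within the TTL.
object DedupLocalTest {

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1) // keep the printed order deterministic

    env.fromElements("a", "b", "a", "c", "b", "a")
      .map(x => (x, "1"))
      .keyBy(_._1)
      .process(new Deduplicator())
      .print() // expect: a, b, c

    env.execute("dedup local test")
  }
}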

Here is the utility class for sending over UDP:

package com.utils;

import java.io.IOException;
import java.net.*;

public class SocketUDPClient {

    private InetAddress ip;
    private int port;
    private DatagramSocket socket;

    public SocketUDPClient(String ip, int port) throws UnknownHostException, SocketException {
        this.ip = InetAddress.getByName(ip);
        this.port = port;
//        Create a UDP socket bound to any free local UDP port
        socket = new DatagramSocket();
//        Or bind to a fixed local UDP port
//        socket = new DatagramSocket(9000);
    }

    public void send(String data){
        try {
            // Serialize the payload
            byte[] outputData = data.getBytes();
            // Build a datagram addressed to the remote host and port
            DatagramPacket outputPacket = new DatagramPacket(outputData, outputData.length, ip, port);
            // Fire-and-forget send; UDP offers no delivery guarantee
            socket.send(outputPacket);
        } catch (IOException ex) {
            // Don't swallow send failures silently
            ex.printStackTrace();
        }
    }

    public void close(){
        if (socket != null)
            socket.close(); // release the local port
    }

}
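
To verify the UDP sink end to end on a dev box, you can run a throwaway listener on the receiving side. A minimal sketch in Scala (the object name is hypothetical; it assumes the sink's target port 6666):

import java.net.{DatagramPacket, DatagramSocket}

// Throwaway UDP listener: binds port 6666 and prints every datagram it receives.
object UdpDebugListener {

  def main(args: Array[String]): Unit = {
    val socket = new DatagramSocket(6666)
    val buf = new Array[Byte](65535) // large enough for any single UDP payload
    while (true) {
      val packet = new DatagramPacket(buf, buf.length)
      socket.receive(packet) // blocks until a datagram arrives
      println(new String(packet.getData, 0, packet.getLength))
    }
  }
}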

Oh right, and the pom file (see, thorough):

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.example</groupId>
    <artifactId>flink2kafka</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <java.version>1.8</java.version>
        <scala.version>2.12</scala.version>
        <flink.version>1.12.4</flink.version>
    </properties>
    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>

        <!-- Flink connectors -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.12</artifactId>
            <version>${flink.version}</version>
        </dependency>

    </dependencies>

    <build>
        <plugins>
            <!-- Compiles the Scala sources into class files -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <!-- Bound to Maven's compile phase -->
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

If you liked this, give me a follow~~~
