Introduction
Template code for quickly setting up a Flink job that consumes from Kafka.
Getting started
Parent POM
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<flink.version>1.13.6</flink.version>
<java.version>1.8</java.version>
<scala.binary.version>2.12</scala.binary.version>
<slf4j.version>1.7.30</slf4j.version>
<hive.version>3.1.2</hive.version>
</properties>
<dependencyManagement>
<!-- Flink core dependencies -->
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink Kafka -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-base</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Logging dependencies -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>${slf4j.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.24</version>
</dependency>
<!-- JSON format for Flink <-> Kafka; without it, writing to Kafka via the Table API fails -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-json</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- State backend (RocksDB) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Bloom filter (stream-lib) -->
<dependency>
<groupId>com.clearspring.analytics</groupId>
<artifactId>stream</artifactId>
<version>2.7.0</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
</dependencies>
</dependencyManagement>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<configuration>
<source>8</source>
<target>8</target>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.0.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
<repositories>
<repository>
<id>nexus-aliyun</id>
<name>nexus-aliyun</name>
<url>http://maven.aliyun.com/nexus/content/groups/public/</url>
<releases>
<enabled>true</enabled>
</releases>
<snapshots>
<enabled>false</enabled>
</snapshots>
</repository>
</repositories>
Child POM
<name>bigdata</name>
<dependencies>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-base</artifactId>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
</dependency>
<!-- Flink local web UI -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
</dependency>
<!-- Logging dependencies -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-api</artifactId>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
</dependency>
<dependency>
<groupId>com.clearspring.analytics</groupId>
<artifactId>stream</artifactId>
</dependency>
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
</dependency>
</dependencies>
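Because the parent declares every version inside dependencyManagement, the child module omits <version> on its dependencies and only needs to reference the parent. A minimal sketch of the child POM's parent section, assuming hypothetical parent coordinates com.bigdata:flink-template:1.0-SNAPSHOT:
<parent>
<groupId>com.bigdata</groupId>
<artifactId>flink-template</artifactId>
<version>1.0-SNAPSHOT</version>
</parent>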
log4j.properties
log4j.rootLogger=ERROR, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
CommonConfig
public class CommonConfig {
/**
* Kafka bootstrap servers
*/
public final static String BOOTSTRAP_SERVERS="";
/**
* Source and target topics
*/
public final static String SOURCE_TOPIC="";
public final static String TARGET_TOPIC="";
/**
* Application (job) name
*/
public final static String APP_NAME="";
/**
* Checkpoint storage path for the state backend
*/
public final static String STATE_BACKEND_PATH="hdfs://master1:8020/checkpoint";
}
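The constants above are compiled into the jar; an alternative is to read them from program arguments so the same jar can target different clusters. A minimal sketch using Flink's ParameterTool (the argument names are illustrative):
import org.apache.flink.api.java.utils.ParameterTool;

public class CommonConfigFromArgs {
public static void main(String[] args) {
// e.g. --bootstrap.servers broker1:9092 --source.topic in --target.topic out
ParameterTool params = ParameterTool.fromArgs(args);
String bootstrapServers = params.get("bootstrap.servers", CommonConfig.BOOTSTRAP_SERVERS);
String sourceTopic = params.get("source.topic", CommonConfig.SOURCE_TOPIC);
String targetTopic = params.get("target.topic", CommonConfig.TARGET_TOPIC);
System.out.println(bootstrapServers + " " + sourceTopic + " " + targetTopic);
}
}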
KafkaConsumerUtil
public class KafkaConsumerUtil {
static String BOOTSTRAP_SERVERS = CommonConfig.BOOTSTRAP_SERVERS;
public static FlinkKafkaConsumer<String> getKafkaConsumer(String topic, String groupId) {
Properties prop = new Properties();
prop.setProperty("bootstrap.servers", BOOTSTRAP_SERVERS);
prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG,"earliest");
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic,
//the default deserializer fails on null record values, so define a custom one that maps null to an empty string
new KafkaDeserializationSchema<String>() {
@Override
public boolean isEndOfStream(String nextElement) {
return false;
}
@Override
public String deserialize(ConsumerRecord<byte[], byte[]> record) throws Exception {
if(record == null || record.value() == null) {
return "";
}
return new String(record.value(),"UTF-8");
}
@Override
public TypeInformation<String> getProducedType() {
return BasicTypeInfo.STRING_TYPE_INFO;
}
}, prop);
return consumer;
}
}
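FlinkKafkaConsumer still works in 1.13, but the same connector also ships the newer KafkaSource builder API. A minimal sketch of an equivalent source (the group id "kafka-source-demo" is illustrative):
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class KafkaSourceExample {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
KafkaSource<String> source = KafkaSource.<String>builder()
.setBootstrapServers(CommonConfig.BOOTSTRAP_SERVERS)
.setTopics(CommonConfig.SOURCE_TOPIC)
.setGroupId("kafka-source-demo")
.setStartingOffsets(OffsetsInitializer.earliest())
// SimpleStringSchema does not tolerate null record values; keep a custom
// DeserializationSchema (as in KafkaConsumerUtil) if tombstone records are expected
.setValueOnlyDeserializer(new SimpleStringSchema())
.build();
DataStreamSource<String> stream =
env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka-source");
stream.print();
env.execute("kafka-source-demo");
}
}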
KafkaProductUtil
public class KafkaProductUtil {
public static FlinkKafkaProducer getKafkaProduct(String targetTopic) {
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", CommonConfig.BOOTSTRAP_SERVERS);
/**
* With EXACTLY_ONCE the Kafka transaction timeout must be at least as long as Flink's
* checkpoint interval/timeout, otherwise a transaction can expire before the checkpoint
* that commits it completes. 60 * 15 * 1000 ms = 15 minutes, the broker default for
* transaction.max.timeout.ms.
*/
properties.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 60 * 15 * 1000 + "");
KafkaSerializationSchema<String> serializationSchema = new KafkaSerializationSchema<String>() {
@Override
public ProducerRecord<byte[], byte[]> serialize(String element, @Nullable Long timestamp) {
return new ProducerRecord<>(
targetTopic, // target topic
element.getBytes(StandardCharsets.UTF_8)); // record contents
}
};
FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>(
targetTopic, // target topic
serializationSchema, // serialization schema
properties, // producer config
FlinkKafkaProducer.Semantic.EXACTLY_ONCE); // fault-tolerance
return myProducer;
}
}
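With Semantic.EXACTLY_ONCE the producer writes records inside Kafka transactions that are only committed when a Flink checkpoint completes, so plain Kafka consumers reading the target topic should use isolation.level=read_committed to avoid seeing uncommitted or aborted records. A minimal sketch of the relevant consumer property (it can be added to the Properties built in KafkaConsumerUtil):
Properties consumerProps = new Properties();
// only return records from committed transactions
consumerProps.setProperty(ConsumerConfig.ISOLATION_LEVEL_CONFIG, "read_committed");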
FlinkMessageUnique
public class FlinkMessageUnique {
private final static String GROUP_ID= FlinkMessageUnique.class.getSimpleName();
public static void main(String[] args) throws Exception {
// TODO Optional: enable the local Flink web UI (uncomment the three lines below)
// Configuration configuration = new Configuration();
// configuration.setInteger(RestOptions.PORT, 8082);
// StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//keep the parallelism consistent with the number of Kafka partitions
env.setParallelism(1);
// TODO 1. Checkpointing and state backend setup
env.enableCheckpointing(3000L, CheckpointingMode.EXACTLY_ONCE);
//checkpoint timeout
env.getCheckpointConfig().setCheckpointTimeout(60 * 1000L);
//minimum pause between checkpoints: the next checkpoint starts at least 3s after the previous one completes
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
env.getCheckpointConfig().enableExternalizedCheckpoints(
// ExternalizedCheckpointCleanup controls how externalized checkpoints are handled when the job is cancelled:
// DELETE_ON_CANCELLATION deletes the externalized checkpoint state on cancellation
// RETAIN_ON_CANCELLATION retains the externalized checkpoint state on cancellation
CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
);
env.setRestartStrategy(RestartStrategies.failureRateRestart(
// restart on failure: allow at most 10 failures
10,
// measured over a 1-minute interval
Time.of(1L, TimeUnit.MINUTES),
// with a 3-minute delay between restart attempts
Time.of(3L, TimeUnit.MINUTES)
));
//set the state backend (RocksDB)
// the checkpoint path can also be a local file:// path for testing; here it points to HDFS (see CommonConfig)
env.setStateBackend(new RocksDBStateBackend(CommonConfig.STATE_BACKEND_PATH, true));
// env.setStateBackend(new RocksDBStateBackend("hdfs://master1:8020/fink-checkpoints", true));
// env.getCheckpointConfig().setCheckpointStorage("hdfs://master1:8020/bigdata/ck");
System.setProperty("HADOOP_USER_NAME", "bigdata");
DataStreamSource<String> data = env.addSource(KafkaConsumerUtil.getKafkaConsumer(CommonConfig.SOURCE_TOPIC, GROUP_ID));
//TODO Deduplicate the stream with keyed state (the stream-lib Bloom filter is an alternative; see the sketch after this class)
SingleOutputStreamOperator<String> datadata = data.keyBy(new KeySelector<String, String>() {
@Override
public String getKey(String s) throws Exception {
// key by the message itself so duplicate messages share the same keyed state
return s;
}
}).process(new KeyedProcessFunction<String, String, String>() {
private transient ValueState<Integer> uniqueCount;
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<Integer> descriptor =
new ValueStateDescriptor<>(
"uniqueCount", // the state name
TypeInformation.of(new TypeHint<Integer>() {}), // type information
new Integer(0)); // default value of the state, if nothing was set
uniqueCount = getRuntimeContext().getState(descriptor);
}
@Override
public void processElement(String s, Context context, Collector<String> collector) throws Exception {
Integer count = uniqueCount.value();
if(count==0){
collector.collect(s);
uniqueCount.update(1);
}
}
// a stable uid lets this operator's state be matched when restoring from a checkpoint/savepoint
}).uid("process_unique_count");
// write the deduplicated stream back to the target Kafka topic
datadata.addSink(KafkaProductUtil.getKafkaProduct(CommonConfig.TARGET_TOPIC));
env.execute(CommonConfig.APP_NAME);
}
}
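The deduplication above keeps one ValueState entry per distinct message. The com.clearspring.analytics:stream dependency in the POM provides a BloomFilter that can bound memory instead; a minimal sketch, assuming stream-lib's BloomFilter(int expectedElements, double falsePositiveRate) constructor (capacity and error rate are illustrative, and this in-memory filter is not checkpointed, so duplicates may reappear after a restore):
import com.clearspring.analytics.stream.membership.BloomFilter;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class BloomDedupFunction extends KeyedProcessFunction<String, String, String> {
private transient BloomFilter filter;

@Override
public void open(Configuration parameters) throws Exception {
// expect ~1,000,000 distinct messages with a 1% false-positive rate (illustrative numbers)
filter = new BloomFilter(1_000_000, 0.01);
}

@Override
public void processElement(String s, Context ctx, Collector<String> out) throws Exception {
// forward the first occurrence only; later duplicates are dropped
if (!filter.isPresent(s)) {
filter.add(s);
out.collect(s);
}
}
}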
Job submission
/home/bigdata/module/flink-1.13.6/bin/flink run \
-d \
-m yarn-cluster \
-yqu default \
-ynm <application name on YARN> \
-c com.bigdata.FlinkMessageUnique \
SNAPSHOT-jar-with-dependencies.jar
State recovery
After the job is resumed from a checkpoint it gets a new (random) checkpoint directory and continues from where the restored checkpoint left off, e.g. the checkpoint after chk-1560 is written as chk-1561.
/home/bigdata/module/flink-1.13.6/bin/flink run \
-d \
-s hdfs://master1:8020/checkpoint/bigdata/ck/d5e28ce894fbd7ea9d25a52c1972892d/chk-1560 \
-m yarn-cluster \
-yqu default \
-ynm <application name on YARN> \
-c com.bigdata.FlinkMessageUnique \
SNAPSHOT-jar-with-dependencies.jar
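To pick the checkpoint to pass to -s, list the job's retained checkpoint directory on HDFS and take the latest chk-N (the job id here is the one from the example above):
hdfs dfs -ls hdfs://master1:8020/checkpoint/bigdata/ck/d5e28ce894fbd7ea9d25a52c1972892d/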