1 pom
<dependencies>
<!-- Flink Java API -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Hadoop development environment: start -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>${hadoop.version}</version>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-hdfs</artifactId>
<version>${hadoop.version}</version>
<!-- Exclude xml-apis pulled in transitively by hadoop-hdfs -->
<exclusions>
<exclusion>
<groupId>xml-apis</groupId>
<artifactId>xml-apis</artifactId>
</exclusion>
</exclusions>
</dependency>
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-client</artifactId>
<version>${hadoop.version}</version>
</dependency>
<!-- Hadoop development environment: end -->
<!-- Hive development environment -->
<dependency>
<groupId>org.apache.hive</groupId>
<artifactId>hive-jdbc</artifactId>
<version>${hive.version}</version>
<exclusions>
<exclusion>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-1.2-api</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-web</artifactId>
</exclusion>
<exclusion>
<groupId>org.apache.logging.log4j</groupId>
<artifactId>log4j-slf4j-impl</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- JSON parsing -->
<dependency>
<groupId>org.json</groupId>
<artifactId>json</artifactId>
<version>${json.version}</version>
</dependency>
<!-- HBase development environment: start -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
</dependency>
<!-- Phoenix -->
<dependency>
<groupId>org.apache.phoenix</groupId>
<artifactId>phoenix-core</artifactId>
<version>${phoenix.version}</version>
<exclusions>
<exclusion>
<groupId>org.glassfish</groupId>
<artifactId>javax.el</artifactId>
</exclusion>
</exclusions>
</dependency>
<!-- Flink development environment: start -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-filesystem_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.11_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-hbase_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-statebackend-rocksdb_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime-web_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink development environment: end -->
<!-- Kafka client -->
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
</dependency>
<!-- lombok -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>${lombok.version}</version>
</dependency>
</dependencies>
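The ${...} placeholders above are expected to be defined in the POM's <properties> block. A minimal sketch follows; the concrete version numbers are assumptions (any Flink 1.x release that still ships the Scala 2.11 / Kafka 0.11 connector artifacts used above will do):

<properties>
<flink.version>1.7.2</flink.version>
<hadoop.version>2.7.7</hadoop.version>
<hive.version>2.3.4</hive.version>
<json.version>20190722</json.version>
<hbase.version>1.4.9</hbase.version>
<phoenix.version>4.14.1-HBase-1.4</phoenix.version>
<kafka.version>0.11.0.3</kafka.version>
<lombok.version>1.18.8</lombok.version>
</properties>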
Code
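For reference, these are the imports the snippet below relies on; the package paths are assumptions based on the Flink 1.x / Kafka 0.11 connector artifacts in the POM above, and ConfigLoader is a project-local helper class.

import java.io.IOException;
import java.util.Properties;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011;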
// Set the HADOOP_USER_NAME system property so HDFS can be accessed from the local environment
System.setProperty("HADOOP_USER_NAME", "root");
// 1. Create the Flink streaming environment; configure checkpointing (state stored on HDFS), partition discovery, the restart strategy, and internal handling of data backlog
StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment();
// 1.1 Set the stream time characteristic. ProcessingTime: the machine's system time when the event is processed; IngestionTime: the time the event enters Flink; EventTime: a timestamp carried in a field of the event data
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
// 1.2 Checkpoint configuration: enable checkpointing with an interval of 5 minutes
env.enableCheckpointing(300000);
// 1.3 Checkpointing mode: EXACTLY_ONCE, so messages are neither lost nor processed twice
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// 1.4 Set a minimum pause between checkpoints, so checkpointing does not run so frequently that it slows down processing
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(5000);
// 1.5 Set the checkpoint timeout
env.getCheckpointConfig().setCheckpointTimeout(20000);
// 1.6 Set the maximum number of checkpoints that may be in progress at the same time (must be >= 1)
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
// 1.7 Retain checkpoints when the job is cancelled; by default a checkpoint is deleted when the whole job is cancelled
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
// 1.8 Do not fail the job when saving a checkpoint fails
env.getCheckpointConfig().setFailOnCheckpointingErrors(false);
String hdfsUri = ConfigLoader.getProperty("hdfsUri");
// 1.9 Set where checkpoint state is stored: RocksDBStateBackend backed by HDFS (pass true as a second constructor argument to enable incremental checkpoints)
try {
env.setStateBackend(new RocksDBStateBackend(hdfsUri + "/flink/checkpoint/KafkaSourceDataTask"));
} catch (IOException e) {
e.printStackTrace();
}
// Restart strategy: do not restart on failure
env.setRestartStrategy(RestartStrategies.noRestart());
// Failure-rate restart strategy:
// env.setRestartStrategy(RestartStrategies.failureRateRestart(3, Time.minutes(5), Time.minutes(10)));
// Fixed-delay restart strategy:
// env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 600000));
// 2. Create the Flink Kafka consumer, along with the Kafka consumer properties
String topic = ConfigLoader.getProperty("kafka.topic");
String brokeServers = ConfigLoader.getProperty("bootstrap.servers");
Properties prop = new Properties();
prop.setProperty("bootstrap.servers", brokeServers);
prop.setProperty("group.id", "KafkaSourceDataTask");
// Enable Kafka partition discovery, checking every 30 seconds
prop.setProperty("flink.partition-discovery.interval-millis", "30000");
// Set the key deserializer
prop.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
// Set the value deserializer
prop.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
// Set the offset reset strategy to earliest, used when no committed offset exists for the group
prop.setProperty("auto.offset.reset", "earliest");
FlinkKafkaConsumer011<String> kafkaConsumer = new FlinkKafkaConsumer011<String>(topic, new SimpleStringSchema(), prop);
// Start consuming from the offsets committed for this consumer group
kafkaConsumer.setStartFromGroupOffsets();
// Commit offsets to Kafka as part of the checkpoint
kafkaConsumer.setCommitOffsetsOnCheckpoints(true);
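The snippet above only configures the consumer. A minimal sketch of how it would typically be wired into the job (the print() sink and the job name string are placeholders, not the original pipeline):

// Attach the Kafka consumer as the job's source; print() is a placeholder sink for the raw records
env.addSource(kafkaConsumer).print();
// Submit the job; execute() throws Exception, so the enclosing method must declare or handle it
env.execute("KafkaSourceDataTask");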