以下是我在公司项目上写的一些代码,删去了业务逻辑后的通用内容
后续会再分享一些其他flink的链路
首先肯定要先导入maven依赖
我的依赖如下
<properties>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
<encoding>UTF-8</encoding>
<scala.version>2.11.8</scala.version>
<scala.binary.version>2.11</scala.binary.version>
<hadoop.version>3.0.0-cdh6.3.0</hadoop.version>
<flink.version>1.12.0</flink.version>
<kafka.version>1.1.1</kafka.version>
<hive.version>2.1.1-cdh6.3.0</hive.version>
<hbase.version>1.2.0</hbase.version>
<mysql.connector.version>5.1.40</mysql.connector.version>
<kudu.version>1.10.0</kudu.version>
</properties>
<profiles>
<profile>
<id>dev</id>
<activation>
<activeByDefault>true</activeByDefault>
</activation>
<properties>
<maven.dependency.scope>compile</maven.dependency.scope>
</properties>
</profile>
<profile>
<id>prod</id>
<properties>
<maven.dependency.scope>provided</maven.dependency.scope>
</properties>
</profile>
</profiles>
<repositories>
<repository>
<id>apache.snapshots</id>
<name>Apache Development Snapshot Repository</name>
<url>https://repository.apache.org/content/repositories/snapshots/</url>
<releases>
<enabled>false</enabled>
</releases>
<snapshots>
<enabled>true</enabled>
</snapshots>
</repository>
</repositories>
<dependencies>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-connector-filesystem -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-filesystem_${scala.binary.version}</artifactId>
<version>1.11.3</version>
</dependency>
<!-- https://mvnrepository.com/artifact/com.alibaba/fastjson -->
<dependency>
<groupId>com.alibaba</groupId>
<artifactId>fastjson</artifactId>
<version>1.2.62</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.flink/flink-avro -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-avro</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-avro-confluent-registry</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.kudu/kudu-client -->
<dependency>
<groupId>org.apache.kudu</groupId>
<artifactId>kudu-client</artifactId>
<version>${kudu.version}</version>
<!--<scope>test</scope>-->
</dependency>
<dependency>
<groupId>com.google.protobuf</groupId>
<artifactId>protobuf-java</artifactId>
<version>2.6.0</version>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-protocol</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-client</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hbase/hbase-common -->
<dependency>
<groupId>org.apache.hbase</groupId>
<artifactId>hbase-common</artifactId>
<version>${hbase.version}</version>
<scope>provided</scope>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-common -->
<dependency>
<groupId>org.apache.hadoop</groupId>
<artifactId>hadoop-common</artifactId>
<version>3.0.0</version>
<scope>provided</scope>
</dependency>
<!-- Apache Flink dependencies -->
<!-- These dependencies are provided, because they should not be packaged into the JAR file. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-core</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-runtime_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<!-- Kafka -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<dependency>
<groupId>org.apache.kafka</groupId>
<artifactId>kafka-clients</artifactId>
<version>${kafka.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<!-- Hive -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-hive_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<!--读写hdfs-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-files</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- flink table -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-table-common</artifactId>
<version>${flink.version}</version>
<scope>${maven.dependency.scope}</scope>
</dependency>
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>commons-logging</groupId>
<artifactId>commons-logging-api</artifactId>
<version>1.1</version>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<scope>runtime</scope>
</dependency>
</dependencies>
然后在build里的个性化设置就不粘贴了
接下来是主类,整个主类的传参只需要一个配置文件,然后根据代码内容把对应的配置项写到配置文件即可
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringEncoder;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.file.sink.FileSink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.sink.filesystem.OutputFileConfig;
import org.apache.flink.streaming.api.functions.sink.filesystem.bucketassigners.DateTimeBucketAssigner;
import org.apache.flink.streaming.api.functions.sink.filesystem.rollingpolicies.DefaultRollingPolicy;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;
import org.apache.flink.core.fs.Path;
import java.io.BufferedReader;
import java.io.FileReader;
import java.math.BigInteger;
import java.util.Properties;
import java.util.concurrent.TimeUnit;
public class KafkaToHdfs {
public static void main(String[] args) throws Exception{
//传参
String config_path = args[0];
//获取配置文件
Properties properties = new Properties();
BufferedReader bufferedReader = new BufferedReader(new FileReader(config_path));
properties.load(bufferedReader);
//解析配置文件
String topic_name = properties.getProperty("kakfa.topic.name");
String group_name = properties.getProperty("kakfa.group.name");
String kafka_ips = properties.getProperty("kakfa.ips");
String out_path = properties.getProperty("hdfs.outpath");
String check_path = properties.getProperty("hdfs.checkpoint.path");
String job_name = properties.getProperty("flink.job.name");
String head_name = properties.getProperty("file.header.name");
//设置FLINK环境
StreamExecutionEnvironment env = FlinkEnvUtils.creatEnv(check_path);
//创建kafka环境
Properties props = new Properties();
props.setProperty("bootstrap.servers", kafka_ips);
props.setProperty("group.id", group_name);
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic_name, new SimpleStringSchema(), props);
consumer.setCommitOffsetsOnCheckpoints(true);
consumer.setStartFromGroupOffsets();
//创建流
DataStream<String> stream = env.addSource(consumer);
//设置文件格式
OutputFileConfig config = OutputFileConfig
.builder()
.withPartPrefix(head_name)
.withPartSuffix(".dat")
.build();
//设置时间格式
DateTimeBucketAssigner dateTimeBucketAssigner = new DateTimeBucketAssigner("yyyyMMddHH");
//设置文件生成
FileSink<String> sink = FileSink
.forRowFormat(new Path(out_path), new SimpleStringEncoder<String>("UTF-8"))
.withBucketAssigner(dateTimeBucketAssigner)
.withRollingPolicy(
DefaultRollingPolicy.builder()
.withRolloverInterval(TimeUnit.MINUTES.toMillis(5))
.withInactivityInterval(TimeUnit.MINUTES.toMillis(5))
.withMaxPartSize(1024*1024*1024)
.build())
.withOutputFileConfig(config)
.build();
//业务处理
stream.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String value, Collector<String> out) {
if(value!=null) {
//TODO 这里写具体的业务处理逻辑,每读取一个kafka的offset,会进行一次处理
out.collect(value);
}
}
}
}).sinkTo(sink);
//进行执行
env.execute(job_name);
}
}
FileSink可以自定义滚动策略
withRolloverInterval 包含了至少多少时间的数据量
withInactivityInterval 多久没接收到数据
withMaxPartSize 文件大小达到了多少
当满足以上三个条件的任何一个时都会将 In-progress 状态文件转化为正式文件
其中FlinkEnvUtils.creatEnv方法的代码如下,是一些配置项
/**
 * Builds a StreamExecutionEnvironment with exactly-once checkpointing every five
 * minutes, filesystem-backed state at {@code check_path}, and checkpoints retained
 * on cancellation so a cancelled job can be resumed from its last checkpoint.
 *
 * <p>NOTE(review): "creatEnv" is a typo for "createEnv"; the name is kept because
 * callers (e.g. KafkaToHdfs.main) reference it as-is.
 *
 * @param check_path filesystem/HDFS URI used by the FsStateBackend for checkpoints
 * @return the configured environment
 */
public static StreamExecutionEnvironment creatEnv(String check_path){
    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // enableCheckpointing(interval) already sets the checkpoint interval, so the
    // original's extra setCheckpointInterval(5*60*1000L) call was redundant and
    // has been removed.
    env.enableCheckpointing(5 * 60 * 1000L);
    env.setStateBackend(new FsStateBackend(check_path));
    env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
    // Abort a checkpoint that has not completed within five minutes.
    env.getCheckpointConfig().setCheckpointTimeout(5 * 60000L);
    // Never run overlapping checkpoints.
    env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
    // Keep checkpoint data when the job is cancelled (manual cleanup required).
    env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
    // NOTE(review): EventTime is enabled but no watermark strategy is visible in
    // this file — confirm downstream operators actually rely on event time.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
    return env;
}