最近在调研各种计算框架,在看完Kafka Streams之后也顺便看了一下最流行的Flink,并结合我们的业务场景试验了一些小demo。下面给出一个简单示例,基本和官方示例类似,只是使用了最新版本的Flink Kafka connector以及最新版本的Flink 1.7.2(截至2019年2月)。
示例简要介绍
第一步将kafka topic作为source添加到DataStream
第二步读取topic的内容进行单词统计(统计单词基本上是大数据框架的helloWorld程序)
第三步将统计结果进行转换
第四步将结果存入到kafka另外一个topic中。
关于Flink的connector以及现在的Kafka connector,参考 https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/connectors/ 和 https://ci.apache.org/projects/flink/flink-docs-release-1.7/dev/connectors/kafka.html
版本
本文试验中使用Flink 1.7.2版本,kafka_2.12-1.0.0。 因此引入了flink-connector-kafka_2.12的1.7.2版本。
由于从kafka读取的内容是string类型,向kafka写入的结果也是string类型,因此DeserializationSchema直接使用自带的SimpleStringSchema。关于如何使用DeserializationSchema以及TypeInformationSerializationSchema、JsonDeserializationSchema、AvroDeserializationSchema,后面有机会再讨论(最近工作非常忙)。
主要代码
pom文件
<?xml version="1.0" encoding="UTF-8"?>
<!--
  Maven build for the FlinkDemo examples (Flink 1.7.2, Scala 2.12 binaries, Java 8).
  Declares the Flink streaming/Kafka dependencies, logging, and a post-package step that
  copies example jars to shorter file names.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.yq</groupId>
    <artifactId>FlinkDemo</artifactId>
    <version>1.0-SNAPSHOT</version>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
        <java.version>1.8</java.version>
        <flink.version>1.7.2</flink.version>
        <scala.binary.version>2.12</scala.binary.version>
    </properties>
    <dependencies>
        <!-- Flink dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- JUnit helpers for tests only; keep them off the runtime classpath -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-test-utils-junit</artifactId>
            <version>${flink.version}</version>
            <scope>test</scope>
        </dependency>
        <!-- core dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-twitter_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- Universal Kafka connector; the Scala suffix follows scala.binary.version like the other artifacts -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-shaded-jackson</artifactId>
            <version>2.7.9-6.0</version>
        </dependency>
        <!-- test dependencies -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-test-utils_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>test</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>test</scope>
            <type>test-jar</type>
        </dependency>
        <!-- state backend and Table API -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-table_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <!-- logging: SLF4J API bound to log4j 1.x -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>1.7.21</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.21</version>
        </dependency>
        <dependency>
            <groupId>org.projectlombok</groupId>
            <artifactId>lombok</artifactId>
            <version>1.16.20</version>
        </dependency>
        <!-- fastjson. NOTE(review): old fastjson releases have published deserialization
             vulnerabilities; consider upgrading before parsing untrusted input. -->
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.31</version>
        </dependency>
    </dependencies>
    <build>
        <plugins>
            <!-- maven-compiler-plugin: compile with the Java level declared in java.version -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.3</version>
                <configuration>
                    <!-- Java version of the source code -->
                    <source>${java.version}</source>
                    <!-- Java version of the runtime environment -->
                    <target>${java.version}</target>
                    <encoding>${project.build.sourceEncoding}</encoding>
                </configuration>
            </plugin>
            <!--simplify the name of example JARs for build-target/examples -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-antrun-plugin</artifactId>
                <version>1.7</version>
                <executions>
                    <execution>
                        <id>rename</id>
                        <phase>package</phase>
                        <goals>
                            <goal>run</goal>
                        </goals>
                        <configuration>
                            <target>
                                <!-- No plugin in this POM builds the classifier jars below (that would
                                     need maven-jar-plugin executions with matching classifiers), so a
                                     missing source file must not fail the package phase. -->
                                <copy file="${project.basedir}/target/FlinkDemo-${project.version}-KafkaConnector.jar" tofile="${project.basedir}/target/KafkaConnector.jar" failonerror="false" />
                                <copy file="${project.basedir}/target/FlinkDemo-${project.version}-WordCount.jar" tofile="${project.basedir}/target/WordCount.jar" failonerror="false" />
                            </target>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <pluginManagement>
            <plugins>
                <!--This plugin's configuration is used to store Eclipse m2e settings only. It has no influence on the Maven build itself.-->
                <plugin>
                    <groupId>org.eclipse.m2e</groupId>
                    <artifactId>lifecycle-mapping</artifactId>
                    <version>1.0.0</version>
                    <configuration>
                        <lifecycleMappingMetadata>
                            <pluginExecutions>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.apache.maven.plugins</groupId>
                                        <artifactId>maven-dependency-plugin</artifactId>
                                        <versionRange>[2.9,)</versionRange>
                                        <goals>
                                            <goal>unpack</goal>
                                        </goals>
                                    </pluginExecutionFilter>
                                    <action>
                                        <ignore/>
                                    </action>
                                </pluginExecution>
                            </pluginExecutions>
                        </lifecycleMappingMetadata>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>
</project>
Java代码
package com.yq.kafka;
import org.apache.flink.api.common.JobExecutionResult;
import org.apache.flink.api.common.JobID;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.util.Collector;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import java.util.Locale;
import java.util.Properties;
import java.util.regex.Pattern;
/**
 * Streaming word count that reads lines from one Kafka topic and writes running counts to
 * another Kafka topic.
 *
 * <p>Input lines look like {@code hello Java}, {@code Hello Test}, {@code Hello Python}. They are
 * first aggregated into a {@code DataStream<Tuple2<String, Integer>>}, then converted to a
 * {@code DataStream<String>} and written to Kafka in the form
 * {@code Kafka and Flink says: (hello,3)}.
 *
 * <p>Optional command-line arguments (each falls back to the value this demo used originally):
 * <ul>
 *   <li>{@code --bootstrap.servers} Kafka broker list, default {@code localhost:9092}</li>
 *   <li>{@code --input-topic} topic to read from, default {@code iot-temp}</li>
 *   <li>{@code --output-topic} topic to write to, default {@code topic1}</li>
 *   <li>{@code --output} if present, counts are also written as text to this path;
 *       otherwise they are printed to stdout</li>
 * </ul>
 *
 * @author EricYang
 * @version 2019/3/11 14:50
 */
public class KafkaConnector {
    private static final String KAFKA_BROKERS = "localhost:9092";
    private static final String DEFAULT_INPUT_TOPIC = "iot-temp";
    private static final String DEFAULT_OUTPUT_TOPIC = "topic1";

    /**
     * Builds and runs the streaming job.
     *
     * @param args command-line arguments parsed by {@link ParameterTool#fromArgs(String[])}
     * @throws Exception if the job cannot be built or executed
     */
    public static void main(String[] args) throws Exception {
        final ParameterTool parameterTool = ParameterTool.fromArgs(args);
        final String brokers = parameterTool.get("bootstrap.servers", KAFKA_BROKERS);
        final String inputTopic = parameterTool.get("input-topic", DEFAULT_INPUT_TOPIC);
        final String outputTopic = parameterTool.get("output-topic", DEFAULT_OUTPUT_TOPIC);

        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.getConfig().setGlobalJobParameters(parameterTool);

        // No key/value deserializer entries here: the Flink Kafka consumer always reads raw bytes
        // and decodes them through the DeserializationSchema, so such settings would be ignored.
        final Properties properties = new Properties();
        properties.setProperty(ConsumerConfig.GROUP_ID_CONFIG, "flink-kafka-connector");
        properties.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
        properties.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "latest");

        DataStream<String> messageStream = env.addSource(
                new FlinkKafkaConsumer<String>(inputTopic, new SimpleStringSchema(), properties));

        DataStream<Tuple2<String, Integer>> counts =
                // split up the lines in pairs (2-tuples) containing: (word,1)
                messageStream.flatMap(new Tokenizer())
                        // group by the tuple field "0" and sum up tuple field "1"
                        .keyBy(0).sum(1);

        DataStream<String> countsString =
                counts.map(new MapFunction<Tuple2<String, Integer>, String>() {
                    private static final long serialVersionUID = -6867736771747690202L;

                    @Override
                    public String map(Tuple2<String, Integer> value) throws Exception {
                        System.out.println("kafka msg=" + value);
                        return "Kafka and Flink says: " + value;
                    }
                });

        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<String>(
                brokers,
                outputTopic,
                new SimpleStringSchema());
        myProducer.setWriteTimestampToKafka(true);
        countsString.addSink(myProducer);

        if (parameterTool.has("output")) {
            counts.writeAsText(parameterTool.get("output"));
        } else {
            System.out.println("Printing result to stdout. Use --output to specify output path.");
            counts.print();
        }

        // execute program
        JobExecutionResult result = env.execute("Streaming Kafka");
        JobID jobId = result.getJobID();
        System.out.println("jobId=" + jobId);
    }

    /**
     * Splits each input line into lower-cased words and emits one {@code (word, 1)} pair per word.
     */
    public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
        private static final long serialVersionUID = 1L;

        /** Word separator: one or more non-word characters. Compiled once, not per record. */
        private static final Pattern NON_WORD = Pattern.compile("\\W+");

        /**
         * Emits a {@code (token, 1)} pair for every non-empty token of the line.
         *
         * @param value input line; a {@code null} line produces no output
         * @param out collector receiving the pairs
         */
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
            if (value == null) {
                return;
            }
            // Locale.ROOT keeps case folding independent of the JVM default locale
            // (e.g. a Turkish locale would map 'I' to a dotless i, which \W then treats as a separator).
            String[] tokens = NON_WORD.split(value.toLowerCase(Locale.ROOT));
            // emit the pairs
            for (String token : tokens) {
                if (!token.isEmpty()) {
                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}