Flink Development Notes
1. pom.xml
<properties>
<maven.compiler.source>8</maven.compiler.source>
<maven.compiler.target>8</maven.compiler.target>
<flink.version>1.11.3</flink.version>
<scala.binary.version>2.11</scala.binary.version>
</properties>
<!--For debugging Flink programs locally in IDEA-->
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.11.8</version>
</dependency>
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!--Key point 1-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<!--Key point 2-->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-scala_2.11</artifactId>
<version>${flink.version}</version>
</dependency>
<build>
<!--Note: problems can arise when the Maven version is too old; 3.6.3 is used here-->
<plugins>
<!--Key point 3-->
<!--This plugin compiles the Scala sources into class files-->
<plugin>
<groupId>net.alchim31.maven</groupId>
<artifactId>scala-maven-plugin</artifactId>
<version>4.4.0</version>
<executions>
<execution>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<!--Key point 4-->
<!--Plugin for building a fat jar-->
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-assembly-plugin</artifactId>
<version>3.3.0</version>
<configuration>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
<executions>
<execution>
<id>make-assembly</id>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
</plugin>
</plugins>
</build>
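With this build configuration, mvn clean package compiles the Scala sources and produces a *-jar-with-dependencies.jar fat jar under target/, which is the artifact submitted to the cluster in section 4 below.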
2. Parallelism priority
From highest to lowest priority: operator-level setParallelism() > env.setParallelism() > the -p flag passed at submission > parallelism.default in flink-conf.yaml.
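A minimal sketch (hypothetical job; host/port are placeholders): here the map runs with parallelism 4 and every other operator with the environment default of 2, regardless of any -p flag passed at submission.

import org.apache.flink.streaming.api.scala._

object ParallelismDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(2) // environment-level default, overrides -p

    env.socketTextStream("localhost", 9999)
      .flatMap(_.split(" "))
      .map((_, 1)).setParallelism(4) // operator-level setting, highest priority
      .keyBy(_._1)
      .sum(1)
      .print()

    env.execute("parallelism priority demo")
  }
}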
3. Insufficient resources when submitting WordCount on a standalone node (default configuration)
With the default configuration each TaskManager offers a single slot (taskmanager.numberOfTaskSlots: 1), so a job whose parallelism exceeds the available slots fails with a NoResourceAvailableException; lower the parallelism or raise the slot count in flink-conf.yaml.
4. Common commands for cluster submission
- bin/flink list
- bin/flink cancel <jobId>
- bin/flink run -c com.xiaofan.wc.StreamWordCount -p 2 /home/hadoop/fanjh/jar/FlinkTutorial-1.0-jar-with-dependencies.jar --host 192.168.1.27 --port 9999
- yarn application -list
- yarn application -kill application_1617758450736_0066
- bin/flink run -m yarn-cluster -c com.xiaofan.wc.StreamWordCount -p 2 /home/hadoop/fanjh/jar/FlinkTurorial1.12.2-1.0-SNAPSHOT-jar-with-dependencies.jar --host 192.168.1.27 --port 9999
- bin/flink run-application -t yarn-application \
  -c com.xiaofan.wc.StreamWordCount \
  -Denv.hadoop.conf.dir=/home/hadoop/app/hadoop/etc/hadoop \
  -Dyarn.application.queue="flink" \
  -Djobmanager.memory.process.size=1024m \
  -Dtaskmanager.memory.process.size=1024m \
  -Dtaskmanager.numberOfTaskSlots=4 \
  -Dparallelism.default=4 \
  -Dyarn.application-attempts=4 \
  -Dyarn.application.name="test" \
  hdfs://cluster/flink-1.12/jar/FlinkTurorial1.12.2-1.0-SNAPSHOT-jar-with-dependencies.jar --host 192.168.1.27 --port 9999
5. Execution graphs
A Flink program passes through four graph layers: the StreamGraph built from the API calls, the JobGraph (operators chained, submitted to the JobManager), the ExecutionGraph (the JobGraph parallelized on the JobManager), and the physical execution graph deployed on the TaskManagers.
6. Slot sharing groups
env.disableOperatorChaining() // disable operator chaining globally
.filter(_.nonEmpty).disableChaining() // break the chain both before and after this operator
.map((_, 1)).startNewChain() // start a new chain here, breaking the link to the upstream operator
The usual motivation for breaking a chain is to avoid sharing a slot, but the operators may still end up in the same slot; to really separate them, assign different slot sharing groups:
.flatMap(_.split(" ")).slotSharingGroup("b") // put this operator (and, by default, all downstream operators) into slot sharing group "b"
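Put together, a runnable sketch of the calls above (assuming a socket source on localhost:9999; the group name "b" is arbitrary):

import org.apache.flink.streaming.api.scala._

object SlotSharingDemo {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    env.socketTextStream("localhost", 9999)
      .flatMap(_.split(" ")).slotSharingGroup("b") // this and downstream operators use group "b"
      .filter(_.nonEmpty).disableChaining() // the filter is chained to neither neighbor
      .map((_, 1)).startNewChain() // a new chain starts at the map
      .keyBy(_._1)
      .sum(1)
      .print()

    env.execute("slot sharing demo")
  }
}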
7. Custom Source
package com.xiaofan.apitest.source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
import java.util.Random
case class SensorReading(id: String, timestamp: Long, temperature: Double)
/**
* Custom SourceFunction
*/
object CustomSourceTest {
def main(args: Array[String]): Unit = {
val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// env.setParallelism(1)
val data: DataStream[SensorReading] = env.addSource(new MySensorSourceFunction)
data.print()
env.execute("custom source test")
}
}
class MySensorSourceFunction extends SourceFunction[SensorReading] {
// Flag indicating whether the source should keep emitting data
var running: Boolean = true
override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
val rand = new Random()
// Randomly generate initial temperatures for a group of 10 sensors: (id, temperature)
var curTemp = 1.to(10).map(i => ("sensor_" + i, rand.nextDouble() * 100))
// Loop indefinitely, producing data until the source is cancelled
while (running) {
// Update each temperature with a small Gaussian-distributed nudge on top of the previous value
curTemp = curTemp.map(
data => (data._1, data._2 + rand.nextGaussian())
)
// Attach the current timestamp to the readings
val curTime: Long = System.currentTimeMillis()
curTemp.foreach(
data => ctx.collect(SensorReading(data._1, curTime, data._2))
)
// Emit a batch every 500 ms
Thread.sleep(500)
}
}
override def cancel(): Unit = running = false
}
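Note: a plain SourceFunction always runs with parallelism 1; to run a custom source in parallel, implement ParallelSourceFunction or extend RichParallelSourceFunction instead.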
8. KeyedStream
/*
min: for each key, keeps the first element's other fields and only updates the aggregated field to the minimum
minBy: for each key, returns the entire element that holds the minimum value
reduce: custom aggregation; e.g. keep the minimum temperature together with the latest timestamp
*/
// 1. Group and aggregate: output each sensor's current minimum temperature; the timestamp is the one carried by that element
val aggStream: DataStream[SensorReading] = dataStream
.keyBy(_.id)
.minBy("temperature")
aggStream.print()
// 2. To output the current minimum temperature together with the most recent timestamp, use reduce
val resultStream: DataStream[SensorReading] = dataStream
.keyBy(_.id)
.reduce(
(curState, newData) => {
SensorReading(curState.id, newData.timestamp, curState.temperature.min(newData.temperature))
}
)
9. SplitStream (deprecated: use side outputs instead of split/select)
// 3.1. Splitting: separate the sensor readings into a low-temperature and a high-temperature stream
val splitStream: SplitStream[SensorReading] = dataStream.split(
data => {
if (data.temperature > 30.0) Seq("high") else Seq("low")
}
)
val highStream: DataStream[SensorReading] = splitStream.select("high")
val lowStream: DataStream[SensorReading] = splitStream.select("low")
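Since split/select is deprecated (and removed in newer Flink versions), here is a minimal equivalent sketch using side outputs; it reuses dataStream and SensorReading from above, and highStream2/lowStream2 are hypothetical names mirroring the example:

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

// Elements above 30.0 go to the main output, the rest to the "low" side output
val lowTag = new OutputTag[SensorReading]("low")
val highStream2: DataStream[SensorReading] = dataStream.process(
  new ProcessFunction[SensorReading, SensorReading] {
    override def processElement(data: SensorReading,
                                ctx: ProcessFunction[SensorReading, SensorReading]#Context,
                                out: Collector[SensorReading]): Unit = {
      if (data.temperature > 30.0) out.collect(data) // main output: high temperatures
      else ctx.output(lowTag, data) // side output: low temperatures
    }
  }
)
val lowStream2: DataStream[SensorReading] = highStream2.getSideOutput(lowTag)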
10. ConnectedStreams
// 3.2. Merging streams with connect
val warningStream: DataStream[(String, Double)] = highStream.map { data => (data.id, data.temperature) }
val connectedStream: ConnectedStreams[(String, Double), SensorReading] = warningStream.connect(lowStream)
// Use coMap to process the two streams with separate functions
val coMapResultStream: DataStream[Product] = connectedStream.map(
warningData => (warningData._1, warningData._2, "warning!"),
lowTempData => (lowTempData.id, "healthy~")
)
coMapResultStream.print("coMapResultStream")
// 3.3. Merging with union (unlike connect, union requires all input streams to have the same type, but can merge more than two)
val unionStream: DataStream[SensorReading] = highStream.union(lowStream)
11. Rich functions
Every DataStream transformation function has a "rich" variant (RichMapFunction, RichFlatMapFunction, ...) that additionally provides a lifecycle (open/close, called once per parallel subtask) and access to the runtime context (getRuntimeContext), e.g. for state, the subtask index, or one-time setup such as database connections.
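A minimal sketch (MyRichMapper is a hypothetical name; SensorReading is the case class from section 7):

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.configuration.Configuration

class MyRichMapper extends RichMapFunction[SensorReading, String] {
  override def open(parameters: Configuration): Unit = {
    // one-time setup per parallel subtask, e.g. opening a database connection
    println("opening subtask " + getRuntimeContext.getIndexOfThisSubtask)
  }

  override def map(value: SensorReading): String =
    value.id + " @ subtask " + getRuntimeContext.getIndexOfThisSubtask

  override def close(): Unit = {
    // teardown, e.g. closing the connection
  }
}

Usage: dataStream.map(new MyRichMapper).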
12. Broadcast stream
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.streaming.api.datastream.BroadcastConnectedStream;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;
public class State_BroadcastState {
public static void main(String[] args) {
//After the control stream's data is broadcast, the regular stream sees it as broadcast state
//1. Create the environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStreamSource<String> inputDS = env.socketTextStream("192.168.1.27", 8888);
DataStreamSource<String> controlDS = env.socketTextStream("192.168.1.27", 9999);
//TODO 1. Broadcast one of the streams (the control stream)
//Define a MapState descriptor; the control stream broadcasts this state
MapStateDescriptor<String, String> broadcast = new MapStateDescriptor<>("broadcast-state", Types.STRING, Types.STRING);
BroadcastStream<String> controlBS = controlDS.broadcast(broadcast);
//TODO 2. Connect the other (data) stream with the broadcast stream
BroadcastConnectedStream<String, String> inputBCS = inputDS.connect(controlBS);
//TODO 3. Call process
inputBCS.process(
new BroadcastProcessFunction<String, String, String>() {
/*
Read the broadcast state and process each incoming element accordingly
*/
@Override
public void processElement(String value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
//TODO 5. Get the broadcast state from the context and read the stored value
ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(broadcast);
String aSwitch = broadcastState.get("switch");
if ("1".equals(aSwitch)) {
out.collect("切换到1的逻辑");
} else if ("2".equals(aSwitch)) {
out.collect("切换到2的逻辑");
}
}
/**
* Process elements of the broadcast (control) stream: defines which data is stored into the broadcast state
* @param value
* @param ctx
* @param out
* @throws Exception
*/
@Override
public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
// TODO 4. Get the broadcast state from the context and store data into it
BroadcastState<String, String> broadcastState = ctx.getBroadcastState(broadcast);
broadcastState.put("switch", value);
}
}
).print();
//Submit the job
try {
env.execute();
} catch (Exception e) {
e.printStackTrace();
}
}
}
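To try this out, the two sockets can be fed with netcat (nc -lk 8888 for the data stream, nc -lk 9999 for the control stream); sending 1 or 2 on port 9999 updates the broadcast state and flips the branch taken for each subsequent element arriving on port 8888.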