1. Create a new Maven project named myflink.
2. Add the required dependencies to pom.xml.
<properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <flink.version>1.7.2</flink.version>
</properties>
<dependencies>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.11</version>
        <scope>test</scope>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-scala -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-scala_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-scala_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-clients -->
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_2.11</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>
3. Create MyRandom.scala.
package cn.alisa.myflink.exp

import org.apache.flink.streaming.api.scala._

import scala.collection.mutable.ArrayBuffer

object MyRandom {
  def main(args: Array[String]): Unit = {
    // Create the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Set the parallelism (you can think of it as the number of threads); 1 means a single thread
    env.setParallelism(1)
    val arr = ArrayBuffer[Int]()
    for (i <- 1 to 100) {
      arr.append(i)
    }
    // Read the data from the collection
    val ds = env.fromCollection(arr)
    // Split the stream: even numbers in one stream, odd numbers in another
    val res = ds.split(num => {
      if (num % 2 == 0) {
        Seq("ou")
      } else {
        Seq("ji")
      }
    })
    val ou = res.select("ou")
    val ji = res.select("ji")
    ou.print("ou")
    ji.print("ji")
    env.execute("shuffle")
  }
}
Run the program: the even numbers are printed with the "ou" prefix and the odd numbers with the "ji" prefix.
If env.setParallelism(1) is omitted, the number of threads is determined by the number of cores on your machine. My machine has 6 cores and 12 hardware threads, so at most 12 threads would be used.
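As a quick way to see this, the environment's default parallelism can be inspected and then overridden per job or per operator. Below is a minimal sketch; the object name ParallelismCheck and the sample pipeline are just for illustration:

import org.apache.flink.streaming.api.scala._

object ParallelismCheck {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Without an explicit setting, the default parallelism follows the available cores
    // (e.g. it would report 12 on a 6-core/12-thread machine, as described above)
    println(env.getParallelism)
    // Job-wide parallelism: every operator runs single-threaded unless overridden
    env.setParallelism(1)
    // Per-operator parallelism overrides the job-wide value for that operator
    env.fromCollection(1 to 10)
      .map(_ * 2).setParallelism(2)
      .print()
    env.execute("parallelism-check")
  }
}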
4. Note that the split operator used above (which splits one data stream into several) is marked as deprecated; side outputs (SideOutput) can be used instead to split the stream.
The implementation looks like this:
package cn.alisa.myflink.exp

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

import scala.collection.mutable.ArrayBuffer

object MyRandom {
  def main(args: Array[String]): Unit = {
    // Create the streaming execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Set the parallelism to 1 (a single thread)
    env.setParallelism(1)
    val arr = ArrayBuffer[Int]()
    for (i <- 1 to 100) {
      arr.append(i)
    }
    // Read the data from the collection
    val ds = env.fromCollection(arr)
    // Route each element to a side output inside a ProcessFunction
    val res = ds.process(new JO())
    res.getSideOutput(new OutputTag[String]("otest")).print("ou")
    res.getSideOutput(new OutputTag[String]("jtest")).print("ji")
    // Run the job
    env.execute("split")
  }
}

// split is deprecated; use a ProcessFunction with side outputs instead
class JO() extends ProcessFunction[Int, Int] {
  lazy val otag: OutputTag[String] = new OutputTag[String]("otest")
  lazy val jtag: OutputTag[String] = new OutputTag[String]("jtest")

  override def processElement(i: Int, context: ProcessFunction[Int, Int]#Context, collector: Collector[Int]): Unit = {
    if (i % 2 == 0) {
      context.output(otag, "ou:" + i)
    } else {
      context.output(jtag, "ji:" + i)
    }
  }
}
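One thing to be aware of with the side-output version: processElement in JO never calls collector.collect, so the main stream returned by ds.process(new JO()) carries no elements and all the data lives in the two side outputs. If the original elements should also stay on the main stream, a minimal sketch of a variant (the class name JOWithMain is just illustrative) would be:

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

// Illustrative variant of JO that also forwards each element on the main stream
class JOWithMain extends ProcessFunction[Int, Int] {
  lazy val otag: OutputTag[String] = new OutputTag[String]("otest")
  lazy val jtag: OutputTag[String] = new OutputTag[String]("jtest")

  override def processElement(i: Int, ctx: ProcessFunction[Int, Int]#Context, out: Collector[Int]): Unit = {
    out.collect(i) // keep the original number on the main stream
    if (i % 2 == 0) ctx.output(otag, "ou:" + i) else ctx.output(jtag, "ji:" + i)
  }
}

With this variant, calling res.print("all") in main would also print every element from the main stream alongside the two side-output streams.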