map
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_Map {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello flink", "hello world1", "hello world1", "hello world2")
    val ds: DataStream[String] = env.fromCollection(arr)
    // map: each input element produces exactly one output element
    val mappedDS: DataStream[String] = ds.map(r => {
      val wordArr: Array[String] = r.split(" ")
      wordArr(0) + "__" + wordArr(1)
    })
    mappedDS.print("stream")
    env.execute()
  }
}
flatMap
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_FlatMap {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello flink", "hello world1", "hello world1", "hello world2")
    val ds: DataStream[String] = env.fromCollection(arr)
    // flatMap: each input element may produce zero or more output elements;
    // here every line is split into its individual words
    val flatMappedDS: DataStream[String] = ds.flatMap(r => {
      val wordArr: Array[String] = r.split(" ")
      wordArr
    })
    flatMappedDS.print("stream")
    env.execute()
  }
}
filter
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_Filter {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello flink", "hello world1", "hello world1", "hello world2")
    val ds: DataStream[String] = env.fromCollection(arr)
    // filter: keep only the elements for which the predicate is true;
    // here every "hello" token is dropped
    val filteredDS: DataStream[String] = ds.flatMap(r => {
      val wordArr: Array[String] = r.split(" ")
      wordArr
    }).filter(_ != "hello")
    filteredDS.print("stream")
    env.execute()
  }
}
keyBy + reduce
package com.third_transform

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

object Transform_KeyBy {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello1 flink", "hello1 world1", "hello2 world1", "hello3 world2")
    val ds: DataStream[String] = env.fromCollection(arr)
    // DataStream ===>>> KeyedStream
    val keyedKS: KeyedStream[(String, String), Tuple] = ds.map(r => {
      val wordArr: Array[String] = r.split(" ")
      (wordArr(0), wordArr(1))
    }).keyBy(0)
    // KeyedStream ===>>> DataStream
    // reduce is a rolling aggregation on a keyed stream: it merges the current
    // element with the previous aggregate to produce a new value. The resulting
    // stream contains every intermediate aggregate, not only the final result.
    val reducedDS: DataStream[(String, String)] = keyedKS.reduce((v1, v2) => (v1._1 + "---" + v2._1, v1._2 + "---" + v2._2))
    reducedDS.print("stream")
    env.execute()
  }
}
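For the four input records above, this prints one rolling result per incoming element; the first element of each key passes through unchanged. Roughly the following, with subtask prefixes omitted (the interleaving across keys can vary with parallelism):

key "hello1": (hello1,flink), then (hello1---hello1,flink---world1)
key "hello2": (hello2,world1)
key "hello3": (hello3,world2)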
Rolling aggregation operators
sum()
min()
max()
minBy()
maxBy()
These run per key on a KeyedStream and emit an updated aggregate for every incoming element. min()/max() only maintain the aggregated field, while minBy()/maxBy() return the whole element containing the extreme value (see the sketch after the example below).
package com.third_transform

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

object Transform_RollingAggregation {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[String] = Array("hello 1", "hello 2", "world 2", "world 3")
    val ds: DataStream[String] = env.fromCollection(arr)
    val keyedKS: KeyedStream[(String, Int), Tuple] = ds.map(r => {
      val wordArr: Array[String] = r.split(" ")
      (wordArr(0), wordArr(1).toInt)
    }).keyBy(0)
    // each aggregation emits a running result per key for every incoming element
    val sumDS: DataStream[(String, Int)] = keyedKS.sum(1)
    val minDS: DataStream[(String, Int)] = keyedKS.min(1)
    val maxDS: DataStream[(String, Int)] = keyedKS.max(1)
    val minByDS: DataStream[(String, Int)] = keyedKS.minBy(1)
    val maxByDS: DataStream[(String, Int)] = keyedKS.maxBy(1)
    sumDS.print("stream1")
    minDS.print("stream2")
    maxDS.print("stream3")
    minByDS.print("stream4")
    maxByDS.print("stream5")
    env.execute()
  }
}
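With two-field tuples where field 0 is the key, min(1) and minBy(1) print the same results, so the example above cannot show the difference. It appears once a tuple has a field that is neither the key nor the aggregated field: min only maintains the aggregated field and, per Flink's documented behavior, the other non-key fields keep the values of the first element seen (they are not guaranteed in general), while minBy emits the entire element that holds the minimum. A minimal sketch, assuming made-up 3-tuple data (the object name Transform_MinVsMinBy is my own):

package com.third_transform

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala._

object Transform_MinVsMinBy {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // (key, payload, value to aggregate)
    val ds: DataStream[(String, Int, Int)] = env.fromCollection(Array(("a", 1, 10), ("a", 2, 5)))
    val keyed: KeyedStream[(String, Int, Int), Tuple] = ds.keyBy(0)
    // min(2):   ("a",1,10) then ("a",1,5)  -- field 1 stays from the first element
    // minBy(2): ("a",1,10) then ("a",2,5)  -- the whole minimal element is emitted
    keyed.min(2).print("min")
    keyed.minBy(2).print("minBy")
    env.execute()
  }
}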
split and select
DataStream → SplitStream: splits one DataStream into two or more DataStreams according to some criterion.
SplitStream → DataStream: selects one or more DataStreams from a SplitStream.
Note that split/select is deprecated in newer Flink releases in favor of side outputs; see the sketch at the end of this post.
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_SplitAndSelect {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[(String, Int)] = Array(("hello1", 1), ("hello2", 2), ("hello2", 3), ("hello3", 4))
    val ds: DataStream[(String, Int)] = env.fromCollection(arr)
    // split only tags each element with one or more output names
    val splitSS: SplitStream[(String, Int)] = ds.split(r => {
      if (r._2 > 2) Seq("big") else Seq("small")
    })
    // select materializes the sub-streams by tag name
    val bigDS: DataStream[(String, Int)] = splitSS.select("big")
    val smallDS: DataStream[(String, Int)] = splitSS.select("small")
    val allDS: DataStream[(String, Int)] = splitSS.select("big", "small")
    bigDS.print("bigDS")
    smallDS.print("smallDS")
    allDS.print("allDS")
    env.execute()
  }
}
connect and comap
connect wraps two streams, whose element types may differ, into a single ConnectedStreams; the two functions passed to map are applied to the first and second stream respectively, and their results are merged into one output stream.
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_ConnectAndComap {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr1: Array[(String, Int)] = Array(("hello1", 1), ("hello2", 2), ("hello2", 3), ("hello3", 4))
    val ds1: DataStream[(String, Int)] = env.fromCollection(arr1)
    val arr2: Array[(Int, String)] = Array((1, "hello"), (2, "hello"), (3, "hello"))
    val ds2: DataStream[(Int, String)] = env.fromCollection(arr2)
    val connectedCS: ConnectedStreams[(String, Int), (Int, String)] = ds1.connect(ds2)
    // the two branches return different tuple types, so the common result type
    // inferred for the merged stream is (Any, Any)
    val coMap: DataStream[(Any, Any)] = connectedCS.map(
      r1 => (r1._1, r1._2 - 5),
      r2 => (r2._1 + 5, r2._2)
    )
    coMap.print("stream")
    env.execute()
  }
}
union
Unlike connect, union requires all input streams to have the same element type, and it can merge more than two streams at once.
package com.third_transform

import org.apache.flink.streaming.api.scala._

object Transform_Union {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr1: Array[(String, Int)] = Array(("hello1", 1), ("hello2", 2), ("hello2", 3), ("hello3", 4))
    val ds1: DataStream[(String, Int)] = env.fromCollection(arr1)
    val arr2: Array[(String, Int)] = Array(("hello2", 2), ("hello2", 3), ("hello3", 4))
    val ds2: DataStream[(String, Int)] = env.fromCollection(arr2)
    val unionDS: DataStream[(String, Int)] = ds1.union(ds2)
    unionDS.print("stream")
    env.execute()
  }
}
To be continued
The newer way to write split and select
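Since split/select is deprecated, the recommended replacement is a side output: a ProcessFunction sends some elements to the main output and routes the rest to an OutputTag. Below is a minimal sketch of the earlier split-and-select example rewritten this way; the object name Transform_SideOutput and the tag name "small" are my own choices, and exact API details may vary across Flink versions.

package com.third_transform

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

object Transform_SideOutput {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val arr: Array[(String, Int)] = Array(("hello1", 1), ("hello2", 2), ("hello2", 3), ("hello3", 4))
    val ds: DataStream[(String, Int)] = env.fromCollection(arr)
    // elements <= 2 go to the "small" side output; the main stream keeps the rest
    val smallTag = OutputTag[(String, Int)]("small")
    val bigDS: DataStream[(String, Int)] = ds.process(new ProcessFunction[(String, Int), (String, Int)] {
      override def processElement(value: (String, Int),
                                  ctx: ProcessFunction[(String, Int), (String, Int)]#Context,
                                  out: Collector[(String, Int)]): Unit = {
        if (value._2 > 2) out.collect(value)    // main output: "big"
        else ctx.output(smallTag, value)        // side output: "small"
      }
    })
    val smallDS: DataStream[(String, Int)] = bigDS.getSideOutput(smallTag)
    bigDS.print("bigDS")
    smallDS.print("smallDS")
    env.execute()
  }
}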