map
说明: 一个输入对应一个输出
示例代码:
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Filter that keeps only the students whose name contains a keyword.
 *
 * @param keyWord substring a student's name must contain to pass the filter
 */
class KeyWordFiter(keyWord: String) extends FilterFunction[Student] {

  /** Returns `true` when the name of `t` contains `keyWord`. */
  override def filter(t: Student): Boolean = t.name.contains(keyWord)
}
/** Maps one comma-separated text line to a [[Student]]. */
class StudentMapper extends MapFunction[String, Student] {

  /**
   * Splits `t` on "," and builds a Student from the first three fields.
   * The third field is converted with `toInt`, so a non-numeric value throws.
   */
  override def map(t: String): Student = {
    val fields = t.split(",")
    Student(id = fields(0), name = fields(1), age = fields(2).toInt)
  }
}
/**
 * Shows three equivalent ways to supply a `map` transformation:
 * a lambda, an anonymous `MapFunction`, and the named `StudentMapper` class.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val lines = env.readTextFile(inputPath)

    // map with a lambda
    val viaLambda: DataStream[Student] = lines.map { line =>
      val fields = line.split(",")
      Student(fields(0), fields(1), fields(2).toInt)
    }
    viaLambda.print("mapFunction:匿名函数")

    // map with an anonymous MapFunction
    val viaAnonymousClass: DataStream[Student] = lines.map(new MapFunction[String, Student] {
      override def map(t: String): Student = {
        val fields = t.split(",")
        Student(fields(0), fields(1), fields(2).toInt)
      }
    })
    viaAnonymousClass.print("mapFunction:匿名类")

    // map with the named StudentMapper class
    val viaNamedClass: DataStream[Student] = lines.map(new StudentMapper)
    viaNamedClass.print("mapFunction:自定义函数")

    env.execute("transform test")
  }
}
flatMap
说明: 对每个输入元素做操作,输出可以是零个、一个或多个元素
代码:
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/** Flat-map function that emits exactly one [[Student]] per comma-separated input line. */
class StudentFlatMapper extends FlatMapFunction[String, Student] {

  /**
   * Splits `t` on ",", builds a Student from the first three fields and
   * hands it to `collector`. The third field is converted with `toInt`.
   */
  override def flatMap(t: String, collector: Collector[Student]): Unit = {
    val fields = t.split(",")
    val student = Student(fields(0), fields(1), fields(2).toInt)
    collector.collect(student)
  }
}
/**
 * Shows three equivalent ways to supply a `flatMap` transformation:
 * a lambda returning a collection, an anonymous `FlatMapFunction`,
 * and the named `StudentFlatMapper` class.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val lines = env.readTextFile(inputPath)

    // flatMap with a lambda: every element of the returned List is emitted
    val viaLambda = lines.flatMap { line =>
      val fields = line.split(",")
      List(Student(fields(0), fields(1), fields(2).toInt))
    }
    viaLambda.print("匿名函数:flatMapFunction")

    // flatMap with an anonymous FlatMapFunction
    val viaAnonymousClass = lines.flatMap(new FlatMapFunction[String, Student] {
      override def flatMap(t: String, collector: Collector[Student]): Unit = {
        val fields = t.split(",")
        val student = Student(fields(0), fields(1), fields(2).toInt)
        collector.collect(student)
      }
    })
    viaAnonymousClass.print("自定义匿名类flatMapFunction:")

    // flatMap with the named StudentFlatMapper class
    val viaNamedClass = lines.flatMap(new StudentFlatMapper)
    viaNamedClass.print("自定义类:flatMapFunction")

    env.execute("transform test")
  }
}
filter
对元素做过滤操作,只返回符合条件的元素
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Filter that keeps only the students whose name contains a keyword.
 *
 * @param keyWord substring a student's name must contain to pass the filter
 */
class KeyWordFiter(keyWord: String) extends FilterFunction[Student] {

  /** Returns `true` when the name of `t` contains `keyWord`. */
  override def filter(t: Student): Boolean = t.name.contains(keyWord)
}
/**
 * Shows three equivalent ways to supply a `filter` transformation:
 * a lambda, the named `KeyWordFiter` class, and an anonymous `FilterFunction`.
 * Each one keeps the students whose name contains "wang".
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val lines = env.readTextFile(inputPath)

    // Parse every line into a Student (lambda form of map)
    val students: DataStream[Student] = lines.map { line =>
      val fields = line.split(",")
      Student(fields(0), fields(1), fields(2).toInt)
    }

    // filter with a lambda
    val viaLambda = students.filter(_.name.contains("wang"))
    viaLambda.print("filterFunction:匿名函数")

    // filter with the named KeyWordFiter class
    val viaNamedClass = students.filter(new KeyWordFiter("wang"))
    viaNamedClass.print("filterFunction:自定义类")

    // filter with an anonymous FilterFunction
    val viaAnonymousClass = students.filter(new FilterFunction[Student] {
      override def filter(t: Student): Boolean = t.name.contains("wang")
    })
    viaAnonymousClass.print("filterFunction:匿名类")

    env.execute("transform test")
  }
}
keyBy与滚动聚合算子
简单的聚合算子有
- min() 分组后求最小值;如果元素是样例类,只有被比较的字段取最小值,其他字段的值仍是第一条记录的值
- max() 求最大值
- sum() 求和
- minBy() 最小值所在的记录
- maxBy() 最大值所在的记录
keyBy与reduce
KeyedStream → DataStream:一个分组数据流的聚合操作,合并当前的元素和上次聚合的结果,产生一个新的值;返回的流中包含每一次聚合的结果,而不是只返回最后一次聚合的最终结果。
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/** Reduce function that keeps the student with the smaller age. */
class StudentReducer extends ReduceFunction[Student] {

  /**
   * Returns `t` only when its age is strictly smaller than that of `t1`;
   * otherwise (including a tie) returns `t1`.
   */
  override def reduce(t: Student, t1: Student): Student =
    if (t1.age <= t.age) t1 else t
}
/**
 * Demonstrates keyed rolling aggregation (`minBy`) and three equivalent ways
 * of supplying a `reduce` function: a lambda, an anonymous `ReduceFunction`,
 * and the named `StudentReducer` class. All of them keep, per student id,
 * the record with the smallest age seen so far.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Parse every line into a Student; all aggregations below operate on this stream
    val mapStream: DataStream[Student] = dataSteam.map(data => {
      val arry = data.split(",")
      Student(arry(0), arry(1), arry(2).toInt)
    })

    // Simple rolling aggregation: per id, the record holding the minimum age
    val aggrateStream = mapStream.keyBy("id").minBy("age")
    aggrateStream.print("滚动聚合算子")

    // reduce with a lambda
    val reduceStream = mapStream.keyBy("id").reduce((curStudent, newStudent) => {
      if (curStudent.age < newStudent.age) Student(curStudent.id, curStudent.name, curStudent.age)
      else Student(newStudent.id, newStudent.name, newStudent.age)
    })
    reduceStream.print("reduceFunction:匿名函数")

    // reduce with an anonymous ReduceFunction
    val reduceStream2 = mapStream.keyBy("id").reduce(new ReduceFunction[Student] {
      override def reduce(t: Student, t1: Student): Student = {
        if (t.age < t1.age) t else t1
      }
    })
    reduceStream2.print("reduceFunction:匿名类")

    // reduce with the named StudentReducer class
    val reduceStream3 = mapStream.keyBy("id").reduce(new StudentReducer)
    reduceStream3.print("reduceFunction:自定义类")

    env.execute("transform test")
  }
}
分流 split和Select
- split
DataStream → SplitStream:根据某些特征把一个 DataStream 拆分成两个或者多个 DataStream
- select
SplitStream → DataStream:从一个 SplitStream 中获取一个或者多个 DataStream
代码:
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Demonstrates `split` / `select`: students are tagged by age group and the
 * tagged stream is then read back as individual or combined streams.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Parse every line into a Student (lambda form of map)
    val mapStream: DataStream[Student] = dataSteam.map(data => {
      val arry = data.split(",")
      Student(arry(0), arry(1), arry(2).toInt)
    })

    // Split: tag each student with exactly one age group
    //   18..55 -> "青年人", above 55 -> "老年人", otherwise -> "未成年人"
    val splitStream = mapStream.split(data => {
      if (data.age >= 18 && data.age <= 55) List("青年人")
      else if (data.age > 55) List("老年人")
      else List("未成年人")
    })

    val manStream = splitStream.select("青年人")
    val childrenStream = splitStream.select("未成年人")
    // The tags passed to select must match the ones emitted by split above;
    // selecting all three yields every student.
    val allManStream = splitStream.select("青年人", "未成年人", "老年人")

    manStream.print("青年人")
    childrenStream.print("未成年人")
    allManStream.print("All")

    env.execute("transform test")
  }
}
合流
Union
DataStream → DataStream:对两个或者两个以上的 DataStream 进行 union 操作,产生一个包含所有 DataStream 元素的新 DataStream。
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Filter that keeps only the students whose name contains a keyword.
 *
 * @param keyWord substring a student's name must contain to pass the filter
 */
class KeyWordFiter(keyWord: String) extends FilterFunction[Student] {

  /** Returns `true` when the name of `t` contains `keyWord`. */
  override def filter(t: Student): Boolean = t.name.contains(keyWord)
}
/** Maps one comma-separated text line to a [[Student]]. */
class StudentMapper extends MapFunction[String, Student] {

  /**
   * Splits `t` on "," and builds a Student from the first three fields.
   * The third field is converted with `toInt`, so a non-numeric value throws.
   */
  override def map(t: String): Student = {
    val fields = t.split(",")
    Student(id = fields(0), name = fields(1), age = fields(2).toInt)
  }
}
/** Flat-map function that emits exactly one [[Student]] per comma-separated input line. */
class StudentFlatMapper extends FlatMapFunction[String, Student] {

  /**
   * Splits `t` on ",", builds a Student from the first three fields and
   * hands it to `collector`. The third field is converted with `toInt`.
   */
  override def flatMap(t: String, collector: Collector[Student]): Unit = {
    val fields = t.split(",")
    val student = Student(fields(0), fields(1), fields(2).toInt)
    collector.collect(student)
  }
}
/** Reduce function that keeps the student with the smaller age. */
class StudentReducer extends ReduceFunction[Student] {

  /**
   * Returns `t` only when its age is strictly smaller than that of `t1`;
   * otherwise (including a tie) returns `t1`.
   */
  override def reduce(t: Student, t1: Student): Student =
    if (t1.age <= t.age) t1 else t
}
/**
 * Co-map function for a connected pair of streams: `(id, name)` tuples on the
 * first input and [[Student]] records on the second. Both are mapped to a
 * `(id, name, label)` triple.
 */
class StudentCoMapper extends CoMapFunction[(String, String), Student, (String, String, String)] {

  /** First input: appends the label "正常人" to the `(id, name)` pair. */
  override def map1(in1: (String, String)): (String, String, String) =
    Tuple3(in1._1, in1._2, "正常人")

  /** Second input: takes id and name from the student and labels it "病人". */
  override def map2(in2: Student): (String, String, String) =
    Tuple3(in2.id, in2.name, "病人")
}
/**
 * Co-flat-map function for a connected pair of streams: `(id, name)` tuples on
 * the first input and [[Student]] records on the second. Each input element
 * produces exactly one `(id, name, label)` triple.
 */
class StudentCoFlatMapper extends CoFlatMapFunction[(String, String), Student, Tuple3[String, String, String]] {

  /** First input: emits the pair extended with the label "正常人". */
  override def flatMap1(in1: (String, String), collector: Collector[Tuple3[String, String, String]]): Unit = {
    val labelled = (in1._1, in1._2, "正常人")
    collector.collect(labelled)
  }

  /** Second input: emits the student's id and name with the label "病人". */
  override def flatMap2(in2: Student, collector: Collector[Tuple3[String, String, String]]): Unit = {
    val labelled = (in2.id, in2.name, "病人")
    collector.collect(labelled)
  }
}
/**
 * Demonstrates `union`: the stream is first split by age group, then two of
 * the selected streams are merged back into a single stream.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Parse every line into a Student (lambda form of map)
    val mapStream: DataStream[Student] = dataSteam.map(data => {
      val arry = data.split(",")
      Student(arry(0), arry(1), arry(2).toInt)
    })

    // Split: tag each student with exactly one age group
    //   18..55 -> "青年人", above 55 -> "老年人", otherwise -> "未成年人"
    val splitStream = mapStream.split(data => {
      if (data.age >= 18 && data.age <= 55) List("青年人")
      else if (data.age > 55) List("老年人")
      else List("未成年人")
    })

    val manStream = splitStream.select("青年人")
    val childrenStream = splitStream.select("未成年人")
    // The tags passed to select must match the ones emitted by split above
    val allManStream = splitStream.select("青年人", "未成年人", "老年人")
    // manStream.print("青年人")
    // childrenStream.print("未成年人")
    // allManStream.print("All")

    // Union: both inputs carry Student elements, so the result is a DataStream[Student]
    val unionStream: DataStream[Student] = manStream.union(childrenStream)
    unionStream.print()

    env.execute("transform test")
  }
}
Connect
DataStream,DataStream → ConnectedStreams:连接两个保持各自类型的数据流。两个数据流被 Connect 之后,只是被放在了同一个流中,内部依然保持各自的数据和形式不发生任何变化,两个流相互独立。
CoMap
CoMap 类似于 map,只是操作的对象变成了 connect 之后的 ConnectedStreams,需要分别为两条流提供各自的映射函数(map1 / map2)
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Co-map function for a connected pair of streams: `(id, name)` tuples on the
 * first input and [[Student]] records on the second. Both are mapped to a
 * `(id, name, label)` triple.
 */
class StudentCoMapper extends CoMapFunction[(String, String), Student, (String, String, String)] {

  /** First input: appends the label "正常人" to the `(id, name)` pair. */
  override def map1(in1: (String, String)): (String, String, String) =
    Tuple3(in1._1, in1._2, "正常人")

  /** Second input: takes id and name from the student and labels it "病人". */
  override def map2(in2: Student): (String, String, String) =
    Tuple3(in2.id, in2.name, "病人")
}
/**
 * Demonstrates `connect` followed by CoMap, supplied in three ways: a pair of
 * lambdas, an anonymous `CoMapFunction`, and the named `StudentCoMapper` class.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Parse every line into a Student; the split below operates on this stream
    val mapStream: DataStream[Student] = dataSteam.map(data => {
      val arry = data.split(",")
      Student(arry(0), arry(1), arry(2).toInt)
    })

    // Split: tag each student with exactly one age group
    //   18..55 -> "青年人", above 55 -> "老年人", otherwise -> "未成年人"
    val splitStream = mapStream.split(data => {
      if (data.age >= 18 && data.age <= 55) List("青年人")
      else if (data.age > 55) List("老年人")
      else List("未成年人")
    })

    val manStream = splitStream.select("青年人")
    val childrenStream = splitStream.select("未成年人")
    // The tags passed to select must match the ones emitted by split above
    val allManStream = splitStream.select("青年人", "未成年人", "老年人")
    // manStream.print("青年人")
    // childrenStream.print("未成年人")
    // allManStream.print("All")

    // Union of two streams with the same element type
    val unionStream: DataStream[Student] = manStream.union(childrenStream)
    // unionStream.print()

    // Give the children stream a different element type, then connect it with
    // the Student stream; connect keeps each side's own type
    val newStream: DataStream[(String, String)] = childrenStream.map(data => {
      (data.id, data.name)
    })
    val conncetStream = newStream.connect(manStream)

    // CoMap with two lambdas: the first handles (id, name) tuples, the second handles Students
    val coMapStream = conncetStream.map(
      data1 => (data1._1, data1._2, "正常人"),
      data2 => (data2.id, data2.name, "不正常")
    )
    coMapStream.print("CoMapFunction:匿名函数")

    // CoMap with an anonymous CoMapFunction
    val coMapStream2 = conncetStream.map(new CoMapFunction[(String, String), Student, Tuple3[String, String, String]] {
      override def map1(in1: (String, String)): (String, String, String) = {
        (in1._1, in1._2, "正常人")
      }
      override def map2(in2: Student): (String, String, String) = {
        (in2.id, in2.name, "病人")
      }
    })
    coMapStream2.print("CoMapFunction:匿名类")

    // CoMap with the named StudentCoMapper class
    val coMapStream3 = conncetStream.map(new StudentCoMapper)
    coMapStream3.print("CoMapFunction:自定义类")

    env.execute("transform test")
  }
}
CoFlatMap
package com.hjt.yxh.apitest
import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector
/**
 * Student record used by the transformation examples.
 *
 * @param id   student identifier
 * @param name student name
 * @param age  student age
 */
case class Student(
  id: String,
  name: String,
  age: Int
)
/**
 * Co-flat-map function for a connected pair of streams: `(id, name)` tuples on
 * the first input and [[Student]] records on the second. Each input element
 * produces exactly one `(id, name, label)` triple.
 */
class StudentCoFlatMapper extends CoFlatMapFunction[(String, String), Student, Tuple3[String, String, String]] {

  /** First input: emits the pair extended with the label "正常人". */
  override def flatMap1(in1: (String, String), collector: Collector[Tuple3[String, String, String]]): Unit = {
    val labelled = (in1._1, in1._2, "正常人")
    collector.collect(labelled)
  }

  /** Second input: emits the student's id and name with the label "病人". */
  override def flatMap2(in2: Student, collector: Collector[Tuple3[String, String, String]]): Unit = {
    val labelled = (in2.id, in2.name, "病人")
    collector.collect(labelled)
  }
}
/**
 * Demonstrates `connect` followed by CoFlatMap, supplied as an anonymous
 * `CoFlatMapFunction` and as the named `StudentCoFlatMapper` class.
 */
object TransFormTest {
  def main(args: Array[String]): Unit = {
    // 1. Create the execution environment and run with parallelism 1
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the student file as a stream of text lines
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Parse every line into a Student (lambda form of map)
    val mapStream: DataStream[Student] = dataSteam.map(data => {
      val arry = data.split(",")
      Student(arry(0), arry(1), arry(2).toInt)
    })

    // Split: tag each student with exactly one age group
    //   18..55 -> "青年人", above 55 -> "老年人", otherwise -> "未成年人"
    val splitStream = mapStream.split(data => {
      if (data.age >= 18 && data.age <= 55) List("青年人")
      else if (data.age > 55) List("老年人")
      else List("未成年人")
    })

    val manStream = splitStream.select("青年人")
    val childrenStream = splitStream.select("未成年人")
    // The tags passed to select must match the ones emitted by split above
    val allManStream = splitStream.select("青年人", "未成年人", "老年人")
    // manStream.print("青年人")
    // childrenStream.print("未成年人")
    // allManStream.print("All")

    // Union of two streams with the same element type
    val unionStream: DataStream[Student] = manStream.union(childrenStream)
    // unionStream.print()

    // Give the children stream a different element type, then connect it with
    // the Student stream; connect keeps each side's own type
    val newStream: DataStream[(String, String)] = childrenStream.map(data => {
      (data.id, data.name)
    })
    val conncetStream = newStream.connect(manStream)

    // CoFlatMap with an anonymous CoFlatMapFunction
    val coFlatMapStream2 = conncetStream.flatMap(new CoFlatMapFunction[(String, String), Student, Tuple3[String, String, String]] {
      override def flatMap1(in1: (String, String), collector: Collector[(String, String, String)]): Unit = {
        collector.collect((in1._1, in1._2, "正常人"))
      }
      override def flatMap2(in2: Student, collector: Collector[Tuple3[String, String, String]]): Unit = {
        collector.collect((in2.id, in2.name, "病人"))
      }
    })
    coFlatMapStream2.print("CoFlatMapFunction:匿名类")

    // CoFlatMap with the named StudentCoFlatMapper class
    val coFlatMapStream3 = conncetStream.flatMap(new StudentCoFlatMapper)
    coFlatMapStream3.print("CoFlatMapFunction:自定义类")

    env.execute("transform test")
  }
}