Flink Learning Notes: 04 - Transform Operations on the Flink DataStream

map

Description: each input element produces exactly one output element.
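
The examples in this note read a comma-separated text file (student.txt) with one record per line in the form id,name,age. The original notes do not show the file contents, so the sample lines below are hypothetical:

1,zhangsan,18
2,wangwu,25
3,lisi,60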

Example code:

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class KeyWordFilter(keyWord: String) extends FilterFunction[Student]{
  override def filter(t: Student): Boolean = {
    t.name.contains(keyWord)
  }
}

class StudentMapper extends MapFunction[String,Student]{
  override def map(t: String): Student = {
    val arry = t.split(",")
    Student(arry(0),arry(1),arry(2).toInt)
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // map with an anonymous function (lambda)
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })
    mapStream.print("mapFunction: lambda")

    // map with an anonymous class
    val mapStream2: DataStream[Student] = dataSteam.map(new MapFunction[String,Student]{
      override def map(t: String): Student = {
        val arry = t.split(",")
        Student(arry(0),arry(1),arry(2).toInt)
      }
    })
    mapStream2.print("mapFunction: anonymous class")

    // map with a user-defined class
    val mapStream3:DataStream[Student] = dataSteam.map(new StudentMapper)
    mapStream3.print("mapFunction: custom class")

    env.execute("transform test")
  }
}

flatMap

Operates on each input element; the output may be one element, several elements, or none.

Code:

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class StudentFlatMapper extends FlatMapFunction[String,Student]{
  override def flatMap(t: String, collector: Collector[Student]): Unit = {
    val arry = t.split(",")
    collector.collect(Student(arry(0),arry(1),arry(2).toInt))
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // flatMap with an anonymous function (lambda)
    val flatMapStream2 = dataSteam.flatMap(data=>{
      val arry = data.split(",")
      List(Student(arry(0),arry(1),arry(2).toInt))
    })
    flatMapStream2.print("flatMapFunction: lambda")

    // flatMap with an anonymous class
    val flatMapStream = dataSteam.flatMap(new FlatMapFunction[String,Student] {
      override def flatMap(t: String, collector: Collector[Student]): Unit = {
        val arry = t.split(",")
        collector.collect(Student(arry(0),arry(1),arry(2).toInt))
      }
    })
    flatMapStream.print("flatMapFunction: anonymous class")

    // flatMap with a user-defined class
    val flatMapStream3 = dataSteam.flatMap(new StudentFlatMapper)
    flatMapStream3.print("flatMapFunction: custom class")
    env.execute("transform test")
  }
}

filter

Filters the elements, returning only those that satisfy the predicate.

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class KeyWordFilter(keyWord: String) extends FilterFunction[Student]{
  override def filter(t: Student): Boolean = {
    t.name.contains(keyWord)
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // map: parse each line into a Student
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })
    
    // filter with an anonymous function (lambda)
    val filterStream = mapStream.filter(student=>student.name.contains("wang"))
    filterStream.print("filterFunction: lambda")

    // filter with a user-defined class
    val filterStream2 = mapStream.filter(new KeyWordFilter("wang"))
    filterStream2.print("filterFunction: custom class")

    // filter with an anonymous class
    val filterStream3 = mapStream.filter(new FilterFunction[Student] {
      override def filter(t: Student): Boolean = {
        t.name.contains("wang")
      }
    })
    filterStream3.print("filterFunction: anonymous class")
    
    env.execute("transform test")

  }

}

keyBy and Rolling Aggregation Operators

The simple rolling aggregation operators are:

  • min() computes the minimum of a field within each group; for a case class, only the aggregated field holds the minimum, while the other fields keep the values of the first record seen for that key (see the sketch after this list)
  • max() computes the maximum of a field
  • sum() sums a field
  • minBy() returns the whole record that contains the minimum value
  • maxBy() returns the whole record that contains the maximum value
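
A minimal sketch of the min()/minBy() difference, assuming the keyed Student stream (mapStream) built in the examples of this note; the input values are hypothetical:

// Suppose key "1" receives Student("1","li",20) and then Student("1","wang",18).
//
// min("age")   emits Student("1","li",20), then Student("1","li",18):
//   only the aggregated field is updated; the other fields keep the
//   values of the first record seen for that key.
//
// minBy("age") emits Student("1","li",20), then Student("1","wang",18):
//   the complete record holding the minimum is forwarded.
val minStream   = mapStream.keyBy("id").min("age")
val minByStream = mapStream.keyBy("id").minBy("age")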

keyBy and reduce

KeyedStream → DataStream: an aggregation over a keyed stream that combines the current element with the result of the previous aggregation to produce a new value. The returned stream contains the result of every aggregation step, not just the final result of the last one.
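
For example, if key "1" receives students aged 20, 18, and 25 (hypothetical input), a reduce that keeps the younger student emits three records with ages 20, 18, and 18: one output per input, each reflecting the aggregation so far.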

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)


class StudentReducer extends ReduceFunction[Student]{
  override def reduce(t: Student, t1: Student): Student = {
     if(t.age<t1.age) t else t1
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // Convert each line into a Student (this map step is required by the operators below)
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })

    // Simple rolling aggregation: minBy keeps the whole record with the minimum age
    val aggrateStream = mapStream.keyBy("id").minBy("age")
    aggrateStream.print("rolling aggregation")

    // reduce with an anonymous function (lambda): keep the younger student per key
    val reduceStream = mapStream.keyBy("id").reduce((curStudent,newStudent)=>{
      if (curStudent.age < newStudent.age) curStudent else newStudent
    })
    reduceStream.print("reduceFunction: lambda")

    // reduce with an anonymous class
    val reduceStream2 = mapStream.keyBy("id").reduce(new ReduceFunction[Student]{
      override def reduce(t: Student, t1: Student): Student = {
        if(t.age<t1.age) t else t1
      }
    })
    reduceStream2.print("reduceFunction: anonymous class")

    // reduce with a user-defined class
    val reduceStream3 = mapStream.keyBy("id").reduce(new StudentReducer)
    reduceStream3.print("reduceFunction: custom class")

    env.execute("transform test")
  }

}

Stream Splitting: split and select

  • split
    DataStream → SplitStream: splits one DataStream into two or more DataStreams according to some criterion
  • select
    SplitStream → DataStream: retrieves one or more DataStreams from a SplitStream

(Note: split/select were deprecated in later Flink releases in favor of side outputs; a side-output sketch follows the example below.)

Code:

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)


object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // map: parse each line into a Student
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })

    // Split the stream by age group
    val splitStream = mapStream.split(data=>{
      if (data.age >= 18 && data.age <= 55) List("young")
      else if (data.age > 55) List("elderly")
      else List("minor")
    })
    val manStream = splitStream.select("young")
    val childrenStream = splitStream.select("minor")
    val allManStream = splitStream.select("young","minor","elderly")
    manStream.print("young")
    childrenStream.print("minor")
    allManStream.print("All")
    
    env.execute("transform test")
  }
}
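
Since split/select were deprecated (and eventually removed) in newer Flink versions, the same splitting can be done with side outputs. A minimal sketch under that assumption, reusing the Student case class and mapStream from the example above; the tag names are hypothetical:

import org.apache.flink.streaming.api.functions.ProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

// OutputTag comes from the Flink scala API package
val youngTag = OutputTag[Student]("young")
val minorTag = OutputTag[Student]("minor")

// Elderly records stay on the main output; the others go to tagged side outputs
val processed: DataStream[Student] = mapStream.process(
  new ProcessFunction[Student, Student] {
    override def processElement(value: Student,
                                ctx: ProcessFunction[Student, Student]#Context,
                                out: Collector[Student]): Unit = {
      if (value.age >= 18 && value.age <= 55) ctx.output(youngTag, value)
      else if (value.age < 18) ctx.output(minorTag, value)
      else out.collect(value)
    }
  })

val youngStream: DataStream[Student] = processed.getSideOutput(youngTag)
val minorStream: DataStream[Student] = processed.getSideOutput(minorTag)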

Stream Merging

Union

DataStream → DataStream: unions two or more DataStreams of the same element type, producing a new DataStream that contains all elements of all inputs.

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class KeyWordFilter(keyWord: String) extends FilterFunction[Student]{
  override def filter(t: Student): Boolean = {
    t.name.contains(keyWord)
  }
}

class StudentMapper extends MapFunction[String,Student]{
  override def map(t: String): Student = {
    val arry = t.split(",")
    Student(arry(0),arry(1),arry(2).toInt)
  }
}

class StudentFlatMapper extends FlatMapFunction[String,Student]{
  override def flatMap(t: String, collector: Collector[Student]): Unit = {
    val arry = t.split(",")
    collector.collect(Student(arry(0),arry(1),arry(2).toInt))
  }
}

class StudentReducer extends ReduceFunction[Student]{
  override def reduce(t: Student, t1: Student): Student = {
     if(t.age<t1.age) t else t1
  }
}

class StudentCoMapper extends CoMapFunction[(String,String),Student,(String,String,String)]{
  override def map1(in1: (String, String)): (String,String,String) = {
    (in1._1,in1._2,"正常人")
  }

  override def map2(in2: Student): (String,String,String) = {
    (in2.id, in2.name, "病人")
  }
}

class StudentCoFlatMapper extends CoFlatMapFunction[(String,String),Student,Tuple3[String,String,String]]{
  override def flatMap1(in1: (String, String), collector: Collector[Tuple3[String,String,String]]): Unit = {
    collector.collect(Tuple3(in1._1,in1._2,"healthy"))
  }

  override def flatMap2(in2: Student, collector: Collector[Tuple3[String,String,String]]): Unit = {
    collector.collect(Tuple3(in2.id,in2.name,"patient"))
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // map: parse each line into a Student
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })

    // Split the stream by age group
    val splitStream = mapStream.split(data=>{
      if (data.age >= 18 && data.age <= 55) List("young")
      else if (data.age > 55) List("elderly")
      else List("minor")
    })
    val manStream = splitStream.select("young")
    val childrenStream = splitStream.select("minor")
    val allManStream = splitStream.select("young","minor","elderly")
//    manStream.print("young")
//    childrenStream.print("minor")
//    allManStream.print("All")

    // Merge the streams with union
    val unionStream:DataStream[Student] = manStream.union(childrenStream)
    unionStream.print()

    env.execute("transform test")

  }

}

Connect

DataStream, DataStream → ConnectedStreams: connects two data streams while preserving their types. After the connect, the two streams are merely placed inside one common stream; internally each keeps its own data and form unchanged, and the two remain independent of each other. Unlike union, connect joins exactly two streams, but their element types may differ, as the sketch below illustrates.
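
A minimal sketch contrasting union and connect, assuming the env and Student case class from the examples in this note; the streams and values are hypothetical:

// union: all inputs must share one element type; any number of streams may be merged
val s1: DataStream[Student] = env.fromElements(Student("1", "li", 20))
val s2: DataStream[Student] = env.fromElements(Student("2", "wang", 17))
val u: DataStream[Student] = s1.union(s2)

// connect: exactly two streams, and their element types may differ
val s3: DataStream[(String, String)] = env.fromElements(("3", "zhang"))
val c: ConnectedStreams[(String, String), Student] = s3.connect(s1)
// the ConnectedStreams is then consumed with CoMap/CoFlatMap, as shown below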

CoMap

Like map, except that it operates on the connected streams: map1 handles elements from the first stream and map2 those from the second.

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class StudentCoMapper extends CoMapFunction[(String,String),Student,(String,String,String)]{
  override def map1(in1: (String, String)): (String,String,String) = {
    (in1._1,in1._2,"正常人")
  }

  override def map2(in2: Student): (String,String,String) = {
    (in2.id, in2.name, "病人")
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)


    // Convert each line into a Student (this map step is required below)
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })

    // Split the stream by age group
    val splitStream = mapStream.split(data=>{
      if (data.age >= 18 && data.age <= 55) List("young")
      else if (data.age > 55) List("elderly")
      else List("minor")
    })
    val manStream = splitStream.select("young")
    val childrenStream = splitStream.select("minor")
    val allManStream = splitStream.select("young","minor","elderly")
//    manStream.print("young")
//    childrenStream.print("minor")
//    allManStream.print("All")

    // Merge with union
    val unionStream:DataStream[Student] = manStream.union(childrenStream)
//    unionStream.print()
    val newStream:DataStream[(String,String)] = childrenStream.map(data=>{
      (data.id,data.name)
    })
    val conncetStream = newStream.connect(manStream)

    // CoMap with an anonymous function (lambda)
    val coMapStream = conncetStream.map(data1=>(data1._1,data1._2,"healthy"),data2=>(data2.id,data2.name,"patient"))
    coMapStream.print("CoMapFunction: lambda")

    // CoMap with an anonymous class
    val coMapStream2 = conncetStream.map(new CoMapFunction[(String,String),Student,Tuple3[String,String,String]] {
      override def map1(in1: (String, String)): (String,String,String) = {
        (in1._1,in1._2,"healthy")
      }
      override def map2(in2: Student): (String,String,String) = {
        (in2.id,in2.name,"patient")
      }
    })
    coMapStream2.print("CoMapFunction: anonymous class")

    // CoMap with a user-defined class
    val coMapStream3 = conncetStream.map(new StudentCoMapper)
    coMapStream3.print("CoMapFunction: custom class")

    env.execute("transform test")

  }

}

CoFlatMap
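
Like flatMap, but applied to a ConnectedStreams: flatMap1 processes elements from the first stream and flatMap2 those from the second, and each invocation may emit zero, one, or more results through the Collector.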

package com.hjt.yxh.apitest

import org.apache.flink.api.common.functions.{FilterFunction, FlatMapFunction, MapFunction, ReduceFunction}
import org.apache.flink.streaming.api.functions.co.{CoFlatMapFunction, CoMapFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.util.Collector

case class Student(id: String,name: String,age: Int)

class StudentCoFlatMapper extends CoFlatMapFunction[(String,String),Student,Tuple3[String,String,String]]{
  override def flatMap1(in1: (String, String), collector: Collector[Tuple3[String,String,String]]): Unit = {
    collector.collect(Tuple3(in1._1,in1._2,"healthy"))
  }

  override def flatMap2(in2: Student, collector: Collector[Tuple3[String,String,String]]): Unit = {
    collector.collect(Tuple3(in2.id,in2.name,"patient"))
  }
}

object TransFormTest {

  def main(args: Array[String]): Unit = {

    // 1. Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // 2. Read the input file as a stream
    val inputPath = "D:\\java_workspace\\hadoop\\FlinkDemo\\src\\main\\resources\\Data\\student.txt"
    val dataSteam = env.readTextFile(inputPath)

    // map: parse each line into a Student
    val mapStream: DataStream[Student] = dataSteam.map(data=>{
      val arry = data.split(",")
      Student(arry(0),arry(1),arry(2).toInt)
    })

    // Split the stream by age group
    val splitStream = mapStream.split(data=>{
      if (data.age >= 18 && data.age <= 55) List("young")
      else if (data.age > 55) List("elderly")
      else List("minor")
    })
    val manStream = splitStream.select("young")
    val childrenStream = splitStream.select("minor")
    val allManStream = splitStream.select("young","minor","elderly")
//    manStream.print("young")
//    childrenStream.print("minor")
//    allManStream.print("All")

    // Merge with union
    val unionStream:DataStream[Student] = manStream.union(childrenStream)
//    unionStream.print()
    val newStream:DataStream[(String,String)] = childrenStream.map(data=>{
      (data.id,data.name)
    })
    val conncetStream = newStream.connect(manStream)

    // CoFlatMap with an anonymous class
    val coFlatMapStream2 = conncetStream.flatMap(new CoFlatMapFunction[(String,String),Student,Tuple3[String,String,String]] {
      override def flatMap1(in1: (String, String), collector: Collector[(String,String,String)]): Unit = {
        collector.collect((in1._1,in1._2,"healthy"))
      }
      }

      override def flatMap2(in2: Student, collector: Collector[Tuple3[String,String,String]]): Unit = {
        collector.collect((in2.id,in2.name,"patient"))
      }
    })
    coFlatMapStream2.print("CoFlatMapFunction: anonymous class")

    // CoFlatMap with a user-defined class
    val coFlatMapStream3 = conncetStream.flatMap(new StudentCoFlatMapper)
    coFlatMapStream3.print("CoFlatMapFunction: custom class")

    env.execute("transform test")

  }

}
