【Flink】DataSet Function Exercises

  • FlatMap
import org.apache.flink.api.scala.{ExecutionEnvironment, _}

object FlatMapFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    //create mock test data
    val text = env.fromElements("flink hadoop", "spark hive")
    //use flatMap to split each line into words; the function is applied to every input element
    val text2 = text.flatMap(_.split("\\s+"))
    text2.print()
  }
}
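As a quick follow-up, here is a minimal word-count sketch that builds on the same flatMap split (the object name FlatMapWordCount01 is illustrative; it assumes the same Scala DataSet API as above):

import org.apache.flink.api.scala.{ExecutionEnvironment, _}

object FlatMapWordCount01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    //split each line into words, pair every word with 1, group by the word and sum the counts
    val counts = env.fromElements("flink hadoop", "spark hive")
      .flatMap(_.split("\\s+"))
      .map((_, 1))
      .groupBy(0)
      .sum(1)
    counts.print()
  }
}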
  • Map
import org.apache.flink.api.scala.{ExecutionEnvironment, _}

object MapFunction01 {
  def main(args: Array[String]): Unit = {
    //create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    //create mock test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    //convert each word to uppercase and compute its length
    val res1 = text.map(str => (str.toUpperCase(), str.trim.length))
    res1.print()
    //use a case class for the result instead of a tuple
    val res2 = text.map(line => LineCount(line.toUpperCase(), line.length))
    res2.print()
  }
}
case class LineCount(line: String, count: Int) {
  override def toString: String = line + " " + count
}
  • MapPartition
import java.lang

import org.apache.flink.api.common.functions.MapPartitionFunction
import org.apache.flink.api.scala.{ExecutionEnvironment, _}
import org.apache.flink.util.Collector

object MapPartitionFunction01 {
  def main(args: Array[String]): Unit = {
    //create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    //create mock test data
    val text = env.fromElements("flink hadoop", "spark hive").flatMap(_.split("\\s+"))
    //count the elements per partition
    //the function passed to mapPartition is applied once per partition of the DataSet, producing another DataSet
    //well suited to data that needs no grouping; to transform individual elements, map is the better fit
    //MapPartitionFunction[String, Long]: String is the input element type, Long the output type (a count, hence Long)
    val text2 = text.mapPartition(new MapPartitionFunction[String, Long]() {
      override def mapPartition(iterable: lang.Iterable[String], collector: Collector[Long]): Unit = {
        var count = 0L
        val iterator = iterable.iterator()
        while (iterator.hasNext) {
          iterator.next()
          count += 1
        }
        collector.collect(count)
      }
    })
    text2.print()

    //prepend a prefix to every element
    val text3 = text.mapPartition(new MapPartitionFunction[String, String] {
      override def mapPartition(values: lang.Iterable[String], out: Collector[String]): Unit = {
        val iterator = values.iterator()
        while (iterator.hasNext) {
          var str = iterator.next()
          str = "prefix-" + str
          out.collect(str)
        }
      }
    })
    text3.print()
  }
}
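The Scala DataSet API also exposes an Iterator-based overload of mapPartition, so the per-partition count above can be written more compactly; a sketch that could be appended inside the same main method:

    //count the elements of each partition via the Iterator-based overload
    val text4 = text.mapPartition(it => Iterator(it.size.toLong))
    text4.print()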
  • Reduce
import org.apache.flink.api.common.functions.ReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment

object ReduceFunction01 {
  def main(args: Array[String]): Unit = {
    //create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    //create mock test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    //reduce() combines all input elements into a single result with the user-defined logic
    val text2 = text.reduce((str1, str2) => str1.concat(str2))
    text2.print()
    println("------------------------------------------------")
    val text3 = text.reduce(new ReduceFunction[String]{
      override def reduce(value1: String, value2: String): String = {
        println("The first value to combine:" + value1)
        println("The second value to combine:" + value2)
        value1.concat(value2)
      }
    })
    text3.print()
  }
}
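The same reduce pattern works for numeric aggregates as well; a minimal sketch that could be added inside the same main method (note that the order in which reduce sees the elements is not guaranteed, so the concatenation result above may vary between runs):

    //sum a numeric DataSet with reduce; should print 15
    val nums = env.fromElements(1, 2, 3, 4, 5)
    val sum = nums.reduce(_ + _)
    sum.print()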
  • ReduceGroup
import java.lang

import org.apache.flink.api.common.functions.GroupReduceFunction
import org.apache.flink.api.scala._
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.util.Collector

object ReduceGroupFunction01 {
  def main(args: Array[String]): Unit = {
    //create the execution environment
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()
    //create mock test data
    val text = env.fromElements("flink hadoop", "spark hive hadoop flink", "flink").flatMap(_.split("\\s+"))
    //typically the data is grouped first, then the elements of each group are aggregated
    val text2 = text.map((_, 1)).groupBy(0).reduceGroup(new GroupReduceFunction[(String, Int), (String, Int)] {
      override def reduce(values: lang.Iterable[(String, Int)], out: Collector[(String, Int)]): Unit = {
        val iterator = values.iterator()
        var word = ""
        var cnt = 0
        while (iterator.hasNext) {
          val item = iterator.next()
          word = item._1
          cnt += item._2
        }
        out.collect((word, cnt))
      }
    })
    text2.print()
  }
}
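For comparison, a sketch of the same grouped word count using the built-in sum aggregation instead of a hand-written GroupReduceFunction (it could be appended inside the main method above):

    //equivalent word count via the built-in aggregation
    val text3 = text.map((_, 1)).groupBy(0).sum(1)
    text3.print()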
  • Join
import org.apache.flink.api.scala.{ExecutionEnvironment, _}

object JoinFunction01 {
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    env.getConfig.disableSysoutLogging()

    val stuDataSet = env.fromElements(
      (1, "张三", "男", 21),
      (2, "彭霞", "女", 18),
      (3, "李四", "男", 20),
      (4, "李莉", "女", 23),
      (5, "倩倩", "女", 21)
    )
    val scoreDataSet = env.fromElements(
      (1, 90),
      (2, 84),
      (3, 80),
      (4, 92),
      (5, 87)
    )
    //where selects the join key of the left DataSet, equalTo selects the join key of the right DataSet
    val res = stuDataSet.join(scoreDataSet)
      .where(0)
      .equalTo(0)
    res.print()

  }
}
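By default the joined result is a pair of the two matching tuples; an apply function can project it into a friendlier shape. A sketch that could be appended inside the main method above, reusing the res join (the (name, score) projection is illustrative):

    //project each joined pair of tuples into (name, score)
    val nameScore = res.apply { (stu, score) => (stu._2, score._2) }
    nameScore.print()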
