Flink Part 3: Flink Batch Processing

DataSource operators

    import org.apache.flink.api.scala._

    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // fromCollection supports several concrete collection types
    val datasource1: DataSet[String] = environment.fromCollection(Array("a", "b", "c", "d"))
    // fromElements supports tuples, custom objects and other composite types
    val datasource2: DataSet[String] = environment.fromElements("a", "b", "c", "d")
    // DataSet backed by a generated sequence of numbers
    val datasource3: DataSet[Long] = environment.generateSequence(1, 20)
    // Read a local text file
    val datasource4: DataSet[String] = environment.readTextFile("word.txt")
    // Read a file from HDFS
    val datasource5: DataSet[String] = environment.readTextFile("hdfs://node01:9000/word.txt")
    // Read a CSV file; the target type can be given directly as the type parameter
    case class WordCount(word: String, num: Int)
    val datasource6: DataSet[WordCount] = environment.readCsvFile[WordCount]("word.csv")
    // Read a compressed file; Flink detects the compression type and decompresses it accordingly
    val datasource7 = environment.readTextFile("word.tar.gz")

Flink batch transformation operators

Transformation     Description
map                Transforms each element of a DataSet into another element
flatMap            Transforms each element of a DataSet into 0…n elements
mapPartition       Transforms the elements of a whole partition at once (one function call per partition)
filter             Keeps only the elements that satisfy a predicate
reduce             Aggregates a DataSet or a group into a single element
reduceGroup        Aggregates a DataSet or a group into one or more elements
aggregate          Aggregates with a built-in function, e.g. SUM/MIN/MAX…
distinct           Removes duplicate elements
join               Joins two DataSets on a given condition, producing a new DataSet
union              Takes the union of two DataSets without removing duplicates
rebalance          Redistributes the data evenly across partitions to avoid data skew
partitionByHash    Hash-partitions the data by the specified key
sortPartition      Sorts the data within each partition by the specified fields

Several of the operators above do not appear in the word-count examples that follow; they are sketched right after this table.
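The word-count programs below only exercise flatMap, filter, map, groupBy, sum, reduce and aggregate, so here is a minimal sketch of several of the remaining operators from the table (mapPartition, distinct, reduceGroup, join, union, partitionByHash, sortPartition, rebalance). The sample data, field positions and variable names are made up for illustration.

  import org.apache.flink.api.common.operators.Order
  import org.apache.flink.api.scala._

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    val words = env.fromElements("flink", "spark", "hive", "spark")
    val scores = env.fromElements(("flink", 100), ("spark", 80), ("hive", 60))

    // mapPartition: one function call per partition instead of one per element
    val lengths: DataSet[Int] = words.mapPartition(iter => iter.map(_.length))

    // distinct: remove duplicate elements
    val unique: DataSet[String] = words.distinct()

    // reduceGroup: fold a whole group into one (or more) elements
    val counts: DataSet[(String, Int)] = words
      .map((_, 1))
      .groupBy(0)
      .reduceGroup { iter =>
        val group = iter.toList
        (group.head._1, group.map(_._2).sum)
      }

    // join: connect two DataSets on a key condition
    val joined = counts.join(scores).where(0).equalTo(0)

    // union: concatenate two DataSets without deduplication
    val all: DataSet[String] = words.union(unique)

    // partitionByHash / sortPartition: hash-partition by the word, then sort each partition by count
    val repartitioned = counts
      .partitionByHash(0)
      .sortPartition(1, Order.DESCENDING)

    // rebalance: redistribute records evenly across partitions to avoid skew
    val balanced = all.rebalance()

    lengths.print()
    joined.print()
    repartitioned.print()
    balanced.print()
  }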

sum operator

  import org.apache.flink.api.scala._

  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
    datasource.flatMap(_.split(" "))
      .filter(_.length>1)
      .map((_,1))
      .groupBy(0)
      .sum(1)
      .print()
  }

reduce operator

  import org.apache.flink.api.scala._

  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
    datasource.flatMap(_.split(" "))
      .filter(_.length>1)
      .map((_,1))
      .groupBy(0)
      .reduce((a,b)=>(b._1,a._2+b._2))
      .print()
  }

aggregate operator

  import org.apache.flink.api.java.aggregation.Aggregations
  import org.apache.flink.api.scala._

  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
    datasource.flatMap(_.split(" "))
      .filter(_.length>1)
      .map((_,1))
      .groupBy(0)
      .aggregate(Aggregations.SUM,1)
      .print()
  }

Accumulator

  import org.apache.flink.api.common.JobExecutionResult
  import org.apache.flink.api.common.accumulators.IntCounter
  import org.apache.flink.api.common.functions.RichMapFunction
  import org.apache.flink.api.scala._
  import org.apache.flink.configuration.Configuration
  import org.apache.flink.core.fs.FileSystem

  def main(args: Array[String]): Unit = {
    val environment = ExecutionEnvironment.getExecutionEnvironment
    val datasource: DataSet[String] = environment.fromCollection(Array("flink spark hive", "hive spark hbase", "spark kafka"))
    val res: DataSet[String] = datasource.flatMap(_.split(" "))
      .filter(_.length > 1)
      .map(new RichMapFunction[String, String] {
        // Accumulator instance
        private val intCounter = new IntCounter()
        override def open(parameters: Configuration): Unit = {
          // Register the accumulator: param1 is the accumulator name, param2 is the accumulator instance
          getRuntimeContext.addAccumulator("wordCounter", intCounter)
        }
        override def map(in: String): String = {
          // Increment the accumulator
          intCounter.add(1)
          in
        }
      })
    res.writeAsText("data/wordCount",FileSystem.WriteMode.OVERWRITE)
    val result: JobExecutionResult = environment.execute("WordCount")
    println(result.getAccumulatorResult("wordCounter"))
  }  

Broadcast variables

  import org.apache.flink.api.common.functions.RichMapFunction
  import org.apache.flink.api.scala._
  import org.apache.flink.configuration.Configuration

  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // score: (name, score) pairs to broadcast; student: (id, name) pairs to enrich with scores
    val score = env.fromCollection(List(("张三", 100),("李四", 500),("王五",1000)))
    val student = env.fromCollection(List(("001", "张三"), ("002", "李四"), ("003", "王五")))
    student.map(new RichMapFunction[(String,String),(String,String,Int)] {
      var list: List[(String, Int)] = null
      override def map(in: (String, String)): (String, String, Int) = {
        val sname = in._2
        val tuples = list.filter((x: (String, Int)) => sname == x._1)
        (in._1,in._2,tuples(0)._2)
      }
      override def open(parameters: Configuration): Unit = {
        import scala.collection.JavaConverters._
        // Fetch the broadcast variable registered under the name "score"
        list = getRuntimeContext.getBroadcastVariable[(String,Int)]("score").asScala.toList
      }
    }).withBroadcastSet(score,"score").print()
  }

Distributed cache files

  import org.apache.commons.io.FileUtils
  import org.apache.flink.api.common.functions.RichMapFunction
  import org.apache.flink.api.scala._
  import org.apache.flink.configuration.Configuration

  def main(args: Array[String]): Unit = {
    // env
    val environment = ExecutionEnvironment.getExecutionEnvironment
    // load list
    val list: DataSet[String] = environment.fromCollection(List("a", "b", "c", "d"))
    // Register the cached file: param1 is the file path, param2 is the cache name
    environment.registerCachedFile("D://work//train_data//word.txt","word")
    // Read the cached file in the open() method of a RichMapFunction
    val result = list.map(new RichMapFunction[String, String] {
      override def open(parameters: Configuration): Unit = {
        val myFile = getRuntimeContext.getDistributedCache.getFile("word")
        val lines = FileUtils.readLines(myFile)
        val it = lines.iterator()
        while (it.hasNext) {
          val line = it.next();
          println("line:" + line)
        }
      }
      override def map(in: String): String = {
        in
      }
    })
    result.print()
  }