Flink Code: Table API and Flink SQL (Part 10)

I. Flink SQL and Table API examples (getting started)

1. Converting a file stream into a table, and a DataStream into a table

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.table.api.{DataTypes, EnvironmentSettings, Table}
import org.apache.flink.table.descriptors.{Csv, FileSystem, Schema}

object FlinkTableExample {
  def main(args: Array[String]): Unit = {
    // create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)

    // create the table environment, using the blink planner

    val settings = EnvironmentSettings
      .newInstance()
      // unified stream/batch processing
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings)

    tableEnv
      .connect(new FileSystem().path("/Users/yuanzuo/Desktop/flink-tutorial/FlinkSZ1128/src/main/resources/sensor.txt"))
      .withFormat(new Csv()) // parse the file as CSV
      .withSchema( // define the table schema
        new Schema()
          .field("id", DataTypes.STRING())
          .field("timestamp", DataTypes.BIGINT())
          .field("temperature", DataTypes.DOUBLE())
      )
      .createTemporaryTable("inputTable")    // create a temporary table

    val sensorTable: Table = tableEnv.from("inputTable") // read the temporary table inputTable into a Table
    // query with the Table API
    val resultTable: Table = sensorTable
      .select("id, temperature") // select `id`, `temperature` => (String, Double)
      .filter("id = 'sensor_1'") // filter

    resultTable
        .toAppendStream[(String, Double)] // append stream
        .print()

    // query with Flink SQL
    val resultSqlTable: Table = tableEnv
      .sqlQuery("select id, temperature from inputTable where id ='sensor_1'")

    resultSqlTable
      .toAppendStream[(String, Double)] // append stream
//      .print()

    // convert a DataStream into a dynamic table

    val stream = env.addSource(new SensorSource)

    val table = tableEnv.fromDataStream(stream, 'id, 'timestamp as 'ts, 'temperature as 'temp)
    table
      .select('id, 'temp)
      .toAppendStream[(String, Double)]
      .print()

    env.execute()
  }
}
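
Note: the examples in this post reference a SensorSource that is not shown. A minimal sketch of such a source, assuming the tutorial's usual SensorReading(id, timestamp, temperature) record shape (both names are assumptions here, not taken from the original post):

import java.util.Calendar
import scala.util.Random
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

// assumed record type: sensor id, timestamp in milliseconds, temperature reading
case class SensorReading(id: String, timestamp: Long, temperature: Double)

// sketch of a source that emits random readings for 10 sensors
class SensorSource extends RichParallelSourceFunction[SensorReading] {
  @volatile var running: Boolean = true

  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    val rand = new Random()
    // each sensor starts from a random base temperature
    var curTemps = (1 to 10).map(i => ("sensor_" + i, 65.0 + rand.nextGaussian() * 20))
    while (running) {
      // random-walk the temperatures and emit one reading per sensor
      curTemps = curTemps.map { case (id, temp) => (id, temp + rand.nextGaussian() * 0.5) }
      val ts = Calendar.getInstance.getTimeInMillis
      curTemps.foreach { case (id, temp) => ctx.collect(SensorReading(id, ts, temp)) }
      Thread.sleep(100)
    }
  }

  override def cancel(): Unit = running = false
}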

II. Tumbling window: count the readings per sensor within 10 seconds

1. Table API implementation

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.{EnvironmentSettings, Tumble}

/**
 * Tumbling window: count the readings per sensor within 10 seconds
 */
object CountTempByTableApi {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related setup
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings)

    // convert the stream into a dynamic table
    val dataTable = tableEnv
      // .rowtime marks the event-time attribute; `as` assigns an alias
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp)
      // tumbling window:
      // window on the event-time attribute ts, aliased as w
      .window(Tumble over 10.seconds on 'ts as 'w)
      .groupBy('id, 'w) // analogous to keyBy + timeWindow in the DataStream API (see the sketch after this example)
      .select('id, 'id.count) // how many records fall into each window

    // convert the dynamic table back into a stream
    dataTable
      .toRetractStream[(String, Long)] // (id, id.count); retract stream
      .print()

    env.execute()
  }
}
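
The groupBy comment above compares the windowed aggregation to keyBy + timeWindow. For reference, a rough DataStream-API sketch of the same count (assuming the SensorSource from section I; this snippet is an illustration, not part of the original example):

import org.apache.flink.streaming.api.windowing.time.Time

// rough DataStream-API equivalent of the windowed count above
stream
  .map(r => (r.id, 1L))         // one counter per reading
  .keyBy(_._1)                  // key by sensor id
  .timeWindow(Time.seconds(10)) // 10-second tumbling event-time window
  .sum(1)                       // sum the counters per key and window
  .print()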

2. Flink SQL implementation

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings

object CountTempBySQL {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related setup
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings)

    // convert the stream into a dynamic table;
    // dataTable acts as the table name when concatenated into the query below
    val dataTable = tableEnv
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp)

    val result = tableEnv
      // TUMBLE is a tumbling window; HOP is a sliding (hopping) window -- see the sketch after this example
      .sqlQuery("SELECT id, COUNT(id) FROM " + dataTable + " GROUP BY id, TUMBLE(ts, INTERVAL '10' SECOND)")
      .toRetractStream[(String, Long)]
    result.print() // only a DataStream can be printed

    env.execute()
  }
}
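
The comment above mentions HOP; a sketch of the same count over a sliding window, assuming the same dataTable (the 5-second slide is an assumed illustration). Note the HOP argument order: time attribute, slide interval, then window size:

// sliding (hopping) window variant: 10-second windows, sliding every 5 seconds
val hopResult = tableEnv
  .sqlQuery("SELECT id, COUNT(id) FROM " + dataTable +
    " GROUP BY id, HOP(ts, INTERVAL '5' SECOND, INTERVAL '10' SECOND)")
  .toRetractStream[(String, Long)]
hopResult.print()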

III. leftOuterJoinLateral example

1. Both the Table API and the Flink SQL implementations are included below

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.functions.TableFunction
import org.apache.flink.types.Row

object TableFunctionExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // set parallelism to 1 for the job
    env.setParallelism(1)
    // use event time
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .fromElements("hello#world", "atguigu#zuoyuan")

    // table-related setup
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    // create the table environment
    val tableEnv = StreamTableEnvironment.create(env, settings)

    val split = new Split("#")

    // the field s holds the strings "hello#world" and "atguigu#zuoyuan" from the stream
    val dataTable = tableEnv.fromDataStream(stream, 's)

    dataTable
        // why join? to put s and split(s) into the same row
//        .joinLateral(split('s) as ('word, 'length))
      // same effect here, but a left outer join also keeps rows for which the UDTF emits nothing
        .leftOuterJoinLateral(split('s) as ('word, 'length))
        .select('s, 'word, 'length)
        .toAppendStream[(String, String, Int)]
//        .print()


    // 2. Flink SQL implementation: register the UDF
    tableEnv.registerFunction("split", new Split("#"))
    tableEnv.createTemporaryView("t", dataTable)

    tableEnv
        // T is the alias for the table produced by split, with columns (word, length)
        .sqlQuery(
          """
            |SELECT s, word, length FROM
            | t
            | LEFT JOIN LATERAL TABLE(split(s)) AS T(word, length) ON TRUE""".stripMargin)
        .toAppendStream[Row]
        .print()

    env.execute()
  }

  class Split(separator: String) extends TableFunction[(String, Int)] {
    def eval(str: String): Unit = {
      str.split(separator).foreach(word => collect((word, word.length)))
    }
  }
}
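
For the inputs above, Split("#") emits (hello, 5) and (world, 5) for "hello#world", so the lateral join produces the rows (hello#world,hello,5) and (hello#world,world,5), and likewise (atguigu#zuoyuan,atguigu,7) and (atguigu#zuoyuan,zuoyuan,7).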

IV. Scalar Functions

1. Defining a scalar function

A user-defined scalar function maps zero, one, or more scalar values to a new scalar value.

To define a scalar function, you extend the base class ScalarFunction in org.apache.flink.table.functions and implement one or more evaluation methods. The behavior of a scalar function is determined by these evaluation methods, which must be declared public and named eval (declared directly with def, without override). The parameter types and return type of the eval method determine the parameter and return types of the scalar function.

In the code below, we define our own HashCode function, register it with the TableEnvironment, and call it in a query.

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.scala._
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.table.api.EnvironmentSettings
import org.apache.flink.table.functions.ScalarFunction

object ScalarFunctionExample {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    val stream = env
      .addSource(new SensorSource)
      .assignAscendingTimestamps(_.timestamp)

    // table-related setup
    val settings = EnvironmentSettings
      .newInstance()
      .useBlinkPlanner()
      .inStreamingMode()
      .build()

    val tableEnv = StreamTableEnvironment.create(env, settings)

    val hashCode = new HashCode(10)


    // convert the stream into a dynamic table
    val dataTable = tableEnv
      .fromDataStream(stream, 'id, 'timestamp.rowtime as 'ts, 'temperature as 'temp) // .rowtime marks the event-time attribute

    dataTable
        .select('id, hashCode('id))
        .toAppendStream[(String, Int)]
//        .print()


    // register the UDF
    tableEnv.registerFunction("hashCode", new HashCode(10))

//    tableEnv.createTemporaryView("t", dataTable, 'id)

    tableEnv
        .sqlQuery("SELECT id, hashCode(id) FROM " + dataTable)
        .toAppendStream[(String, Int)]
        .print()


    env.execute()
  }

  class HashCode(val factor: Int) extends ScalarFunction {
    def eval(s: String): Int = {
      s.hashCode() * factor
    }
  }
}
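
Note the two call styles in this example: the Table API applies the function instance directly (hashCode('id)), while SQL requires the function to be registered under a name via registerFunction first and then invokes it by that name.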