出错程序:
import java.sql.Timestamp
import java.text.SimpleDateFormat
import org.apache.flink.api.common.typeinfo.Types
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.sink.SinkFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.table.api.{EnvironmentSettings, Table}
import org.apache.flink.table.api.scala.StreamTableEnvironment
import org.apache.flink.table.descriptors.{Json, Kafka, Rowtime, Schema}
import org.apache.flink.table.functions.ScalarFunction
import org.apache.flink.types.Row
/**
 * Two UV-computation scenarios:
 *
 * 1. Compute, in real time, the UV from midnight today up to the current moment.
 * 2. Compute, in real time, the UV for each hour of the day: 0:00 ... 12:00 ... 24:00.
 *
 * Can the example in this code be used directly in production?
 * With a small data volume, yes; with a large per-second volume it becomes a problem,
 * because the only GROUP BY dimension is the current date,
 * so all aggregation state concentrates on a single key, memory usage balloons, and an OOM can follow.
 *
 * The remedy is the classic divide-and-conquer idea: scatter the state across more keys first,
 * then aggregate the partial results again.
 */
object ComputeUVDay {
def main(args: Array[String]): Unit = {
// Old (pre-Blink) planner, streaming mode.
val fsSettings: EnvironmentSettings = EnvironmentSettings.newInstance().useOldPlanner().inStreamingMode().build()
val fsEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
// Event-time semantics: rowtime below is derived from the "eventtime" field.
fsEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
val tEnv: StreamTableEnvironment = StreamTableEnvironment.create(fsEnv,fsSettings)
// Register the custom scalar UDF under the SQL name "DateUtil".
tEnv.registerFunction("DateUtil", new DateUtil)
// Register the Kafka source table.
tEnv.connect( // connection settings
new Kafka()
.version("0.11")
.topic("test")
.property("bootstrap.servers", "localhost:9092")
.property("group.id","test")
.startFromLatest()
)
.withFormat(
new Json()
.failOnMissingField(false)
.deriveSchema()
)
.withSchema(
new Schema()
.field("rowtime", Types.SQL_TIMESTAMP)
.rowtime(new Rowtime()
.timestampsFromField("eventtime")
// Bounded out-of-orderness watermark: 2000 ms behind the max seen event time.
.watermarksPeriodicBounded(2000)
)
.field("fruit", Types.STRING)
.field("number", Types.LONG)
)
.inAppendMode()
.registerTableSource("source")
// Register the Kafka sink table.
tEnv.connect(
new Kafka()
.version("0.11")
.topic("test_22")
.property("acks", "all")
.property("retries", "0")
.property("batch.size", "16384")
.property("linger.ms", "10")
.property("bootstrap.servers", "localhost:9092")
.sinkPartitionerFixed()
).inAppendMode() // append mode: the sink accepts INSERT changes only
.withFormat(
new Json().deriveSchema()
)
.withSchema(
new Schema()
.field("time1",Types.STRING)
.field("total", Types.LONG)
)
.registerTableSink("sink")
// Day-level UV.
// NOTE: this GROUP BY produces update/retract changes (the distinct count for a day
// keeps changing), while the sink above was registered in append mode — Flink rejects
// this at planning time with "AppendStreamTableSink requires that Table has only
// insert changes." (see the stack trace accompanying this program).
tEnv.sqlUpdate("insert into sink select DateUtil(rowtime) time1,count(distinct fruit) total from source group by DateUtil(rowtime)")
// Hour-level UV.
// val table: Table = tEnv.sqlQuery("select DateUtil(rowtime,'yyyyMMddHH'),count(distinct fruit) from source group by DateUtil(rowtime,'yyyyMMddHH')")
// Convert the table to a DataStream of Rows via toRetractStream.
// The result type is (Boolean, Row):
// the first field is true for an insert and false for a retraction (deletion-driven update).
//tEnv.toRetractStream[Row](table).addSink(new MySink)
fsEnv.execute(this.getClass.getSimpleName)
}
}
/**
 * Custom scalar UDF that formats an epoch-millisecond timestamp as a date string.
 *
 * A fresh SimpleDateFormat is created on every call: SimpleDateFormat is not
 * thread-safe, so sharing one instance across parallel UDF invocations would be unsafe.
 */
class DateUtil extends ScalarFunction {
  /** Formats the timestamp (epoch millis) using the fixed pattern "yyyy-MM-dd". */
  def eval(timestamp:Long): String = {
    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    sdf.format(new Timestamp(timestamp))
  }
  /**
   * Formats the timestamp (epoch millis) using a caller-supplied pattern,
   * e.g. "yyyyMMddHH" for hour-level grouping.
   *
   * Bug fix: the original declared the return type as Unit, so the formatted
   * string was computed and then discarded — SQL callers of this overload
   * received no usable value. The return type is now String.
   */
  def eval(timestamp:Long, format:String): String = {
    val sdf = new SimpleDateFormat(format)
    sdf.format(new Timestamp(timestamp))
  }
}
/**
 * Custom sink that prints the retract flag of each record to stdout.
 * Records arrive as (Boolean, Row); the Boolean is true for an insert
 * and false for a retraction, and only that flag is printed here.
 */
class MySink extends SinkFunction[(Boolean, Row)] {
  override def invoke(value: (Boolean, Row), context: SinkFunction.Context[_]): Unit = {
    val (isInsert, _) = value
    println(isInsert.toString)
  }
}
就是一个kafka的数据,聚合计算之后往另一个kafka写的操作
报错详细信息:
Exception in thread "main" org.apache.flink.table.api.TableException: AppendStreamTableSink requires that Table has only insert changes.
at org.apache.flink.table.planner.StreamPlanner.writeToAppendSink(StreamPlanner.scala:329)
at org.apache.flink.table.planner.StreamPlanner.org$apache$flink$table$planner$StreamPlanner$$writeToSink(StreamPlanner.scala:285)
at org.apache.flink.table.planner.StreamPlanner$$anonfun$2.apply(StreamPlanner.scala:169)
at org.apache.flink.table.planner.StreamPlanner$$anonfun$2.apply(StreamPlanner.scala:155)
at scala.Option.map(Option.scala:146)
at org.apache.flink.table.planner.StreamPlanner.org$apache$flink$table$planner$StreamPlanner$$translate(StreamPlanner.scala:155)
at org.apache.flink.table.planner.StreamPlanner$$anonfun$translate$1.apply(StreamPlanner.scala:127)
at org.apache.flink.table.planner.StreamPlanner$$anonfun$translate$1.apply(StreamPlanner.scala:127)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:234)
at scala.collection.Iterator$class.foreach(Iterator.scala:891)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1334)
at scala.collection.IterableLike$class.foreach(IterableLike.scala:72)
at scala.collection.AbstractIterable.foreach(Iterable.scala:54)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:234)
at scala.collection.AbstractTraversable.map(Traversable.scala:104)
at org.apache.flink.table.planner.StreamPlanner.translate(StreamPlanner.scala:127)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.translate(TableEnvironmentImpl.java:439)
at org.apache.flink.table.api.internal.TableEnvironmentImpl.sqlUpdate(TableEnvironmentImpl.java:348)
at yunqing.table.daywindow.ComputeUVDay$.main(ComputeUVDay.scala:98)
at yunqing.table.daywindow.ComputeUVDay.main(ComputeUVDay.scala)
意思就是 AppendStreamTableSink 需要表只有插入(不能update),去掉表上面的groupBy(),就不会报错了。。。
把sql调整为:
tEnv.sqlUpdate("insert into sink select DateUtil(rowtime) time1, fruit total from source")
程序运行正常(注意:这条 SQL 把 STRING 类型的 fruit 写入了 sink 中声明为 LONG 的 total 列,严格的类型校验下需要调整 sink 的 schema 或对字段做 CAST)
注意: group 字段的 count 值变化的时候,会产生两条数据,一条是旧数据,带着false标示,一条是新数据,带着true标示
但是如果就是必须要执行groupBy操作,该怎么办呢?
换个sink端,要用支持更新的 sink,即 RetractStreamTableSink / UpsertStreamTableSink 的实现(例如 JDBC/MySQL、HBase、Elasticsearch 等)
kafka是一个消息队列,仅仅支持append, 不支持更新和删除