先说下模拟场景:统计日志数据中最近一分钟内各URL的访问量,取出访问量最大的五个URL,每五秒更新一次
解决思路:
1、将日志中的时间转换为时间戳,作为EventTime
2、日志中的时间是乱序的,所以调用assignTimestampsAndWatermarks提取EventTime,并将watermark的延迟设置为60秒
3、构建滑动窗口(窗口长度1分钟,滑动步长5秒)
4、自定义预聚合函数aggregate将数据转换为UrlViewCount格式
5、自定义转换算子process将数据转换输出
数据如下,需要代码的朋友留邮件即可
83.149.9.216 - - 17/05/2015:10:05:03 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-search.png
83.149.9.216 - - 17/05/2015:10:05:43 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard3.png
83.149.9.216 - - 17/05/2015:10:05:47 +0000 GET /presentations/logstash-monitorama-2013/plugin/highlight/highlight.js
83.149.9.216 - - 17/05/2015:10:05:12 +0000 GET /presentations/logstash-monitorama-2013/plugin/zoom-js/zoom.js
83.149.9.216 - - 17/05/2015:10:05:07 +0000 GET /presentations/logstash-monitorama-2013/plugin/notes/notes.js
83.149.9.216 - - 17/05/2015:10:05:34 +0000 GET /presentations/logstash-monitorama-2013/images/sad-medic.png
83.149.9.216 - - 17/05/2015:10:05:57 +0000 GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Bold.ttf
83.149.9.216 - - 17/05/2015:10:05:50 +0000 GET /presentations/logstash-monitorama-2013/css/fonts/Roboto-Regular.ttf
83.149.9.216 - - 17/05/2015:10:05:24 +0000 GET /presentations/logstash-monitorama-2013/images/frontend-response-codes.png
83.149.9.216 - - 17/05/2015:10:05:50 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard.png
83.149.9.216 - - 17/05/2015:10:05:46 +0000 GET /presentations/logstash-monitorama-2013/images/Dreamhost_logo.svg
83.149.9.216 - - 17/05/2015:10:05:11 +0000 GET /presentations/logstash-monitorama-2013/images/kibana-dashboard2.png
83.149.9.216 - - 17/05/2015:10:05:19 +0000 GET /presentations/logstash-monitorama-2013/images/apache-icon.gif
代码如下,详情见注释
package com.thoughtworks.log
import java.sql.Timestamp
import java.text.SimpleDateFormat
import java.util
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.contrib.streaming.state.{RocksDBOptions, RocksDBStateBackend}
import org.apache.flink.core.fs.Path
import org.apache.flink.runtime.state.StateBackend
import org.apache.flink.runtime.state.filesystem.FsStateBackend
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.{Collector, TernaryBoolean}
import scala.collection.mutable.ListBuffer
/**
 * One parsed line of the input access log.
 *
 * @param ip        client address (first space-separated field of the line)
 * @param userName  third field of the line ("-" in the sample data)
 * @param eventTime log time converted to epoch milliseconds
 * @param method    HTTP method, e.g. GET
 * @param url       requested path
 */
case class LogEvent(
  ip: String,
  userName: String,
  eventTime: Long,
  method: String,
  url: String
)
/**
 * Intermediate result: how often one URL was hit inside one window.
 *
 * @param url       the URL that was counted
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of hits counted for `url` in that window
 */
case class UrlViewCount(
  url: String,
  windowEnd: Long,
  count: Long
)
/**
 * Flink job that reads an access log, counts hits per URL over a sliding
 * event-time window and prints the most-visited URLs of each window.
 */
object NetworkLog {

  /** Pattern of the log time followed by its zone offset, e.g. `17/05/2015:10:05:03 +0000`. */
  private val LogTimePattern = "dd/MM/yyyy:HH:mm:ss Z"

  /**
   * Parses one space-separated log line into a [[LogEvent]].
   *
   * Expected layout (0-based fields): 0 = ip, 2 = user name, 3 = time,
   * 4 = zone offset, 5 = HTTP method, 6 = url.
   *
   * @param line raw log line
   * @return the parsed event, with `eventTime` in epoch milliseconds
   * @throws ArrayIndexOutOfBoundsException if the line has fewer than seven fields
   * @throws java.text.ParseException       if the time or offset cannot be parsed
   */
  private def parseLine(line: String): LogEvent = {
    val fields: Array[String] = line.split(" ")
    // SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
    val format = new SimpleDateFormat(LogTimePattern)
    // Parse the offset field together with the time; otherwise the resulting
    // epoch milliseconds would depend on the JVM's default time zone.
    val timestamp = format.parse(s"${fields(3)} ${fields(4)}").getTime
    LogEvent(fields(0), fields(2), timestamp, fields(5), fields(6))
  }

  /**
   * Builds and runs the pipeline.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Use event time, i.e. the time recorded in the log line itself.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // TODO FSImage
    val checkPointPath = new Path("file:///fs/checkpoints")
    val fsStateBackend: StateBackend = new FsStateBackend(checkPointPath)
    env.setStateBackend(fsStateBackend)

    // TODO RocksDB has some problems and is not usable for now
    // val checkpointDataUri = "file:///fs/checkpoints"
    // val tmpDir = "file:///rocksDB/checkpoints"
    // val fsStateBackend: StateBackend = new FsStateBackend(checkpointDataUri)
    // val rocksDBBackend: RocksDBStateBackend = new RocksDBStateBackend(fsStateBackend, TernaryBoolean.TRUE)
    // val config = new Configuration()
    // // Timers can be kept on HEAP (default, better performance) or in RocksDB (scales better)
    // config.setString(RocksDBOptions.TIMER_SERVICE_FACTORY, RocksDBStateBackend.PriorityQueueStateType.ROCKSDB.toString)
    // rocksDBBackend.configure(config)
    // rocksDBBackend.setDbStoragePath(tmpDir)
    // env.setStateBackend(rocksDBBackend.asInstanceOf[StateBackend])

    env.readTextFile("data/apache.log")
      .map(line => parseLine(line))
      // Events may arrive out of order; the watermark trails the max seen timestamp by 60 s.
      .assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor[LogEvent](Time.seconds(60)) {
        override def extractTimestamp(element: LogEvent): Long = element.eventTime
      })
      .keyBy(_.url) // group by URL
      .timeWindow(Time.minutes(1), Time.seconds(5)) // sliding window: 60 s long, sliding every 5 s
      .aggregate(new CountAgg(), new WindowResultFun()) // incremental count, then wrap as UrlViewCount
      .keyBy(_.windowEnd) // group the per-URL counts by the window they belong to
      .process(new TopURL(5)) // rank the URLs of each window and format the top 5
      .print("Log test")
      .setParallelism(1)

    env.execute("haha")
  }
}
/**
 * Incremental pre-aggregation: counts the events that fall into a window.
 *
 * Type arguments of AggregateFunction:
 *  - LogEvent: input type
 *  - Long: accumulator type
 *  - Long: result type
 */
class CountAgg() extends AggregateFunction[LogEvent, Long, Long] {

  /** A fresh accumulator starts counting at zero. */
  override def createAccumulator(): Long = {
    0L
  }

  /** Every incoming event raises the count by one; the event's content is not inspected. */
  override def add(value: LogEvent, accumulator: Long): Long = {
    val incremented: Long = accumulator + 1
    incremented
  }

  /** Two partial counts are combined by summing them. */
  override def merge(a: Long, b: Long): Long = {
    val combined: Long = a + b
    combined
  }

  /** The result is the accumulated count itself. */
  override def getResult(accumulator: Long): Long = {
    accumulator
  }
}
/**
 * Runs when a window fires and wraps the pre-aggregated count in a UrlViewCount.
 *
 * Type arguments of WindowFunction:
 *  - Long: input (the count produced by the pre-aggregation)
 *  - UrlViewCount: output
 *  - String: key, i.e. the URL
 *  - TimeWindow: window type
 */
class WindowResultFun() extends WindowFunction[Long, UrlViewCount, String, TimeWindow] {

  /**
   * Emits one UrlViewCount for the given key and window.
   *
   * @param key    the URL the window is keyed by
   * @param window the window that fired; its end timestamp is copied to the output
   * @param input  pre-aggregated values; only the first element is read
   * @param out    collector receiving the single result
   */
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
    val windowEnd: Long = window.getEnd
    val viewCount: Long = input.iterator.next()
    val result = UrlViewCount(url = key, windowEnd = windowEnd, count = viewCount)
    out.collect(result)
  }
}
/**
 * Collects the per-URL counts of one window and emits the `size` most-visited
 * URLs as a formatted string.
 *
 * Type arguments of KeyedProcessFunction:
 *  - Long: key, the window-end timestamp the stream is keyed by
 *  - UrlViewCount: input
 *  - String: output
 *
 * @param size maximum number of URLs listed per window
 */
class TopURL(size: Int) extends KeyedProcessFunction[Long, UrlViewCount, String] {

  /** Offset (ms) after the window end at which the ranking timer fires. */
  private val timerDelayMs: Long = 100L

  // Created lazily because the runtime context is not available at construction time.
  lazy val urlState: ListState[UrlViewCount] = getRuntimeContext.getListState(new ListStateDescriptor[UrlViewCount]("url_state", classOf[UrlViewCount]))

  /**
   * Buffers every incoming count and registers the timer that ranks the window.
   *
   * @param value one URL's count for the current window
   * @param ctx   context giving access to the timer service
   * @param out   unused here; results are emitted from `onTimer`
   */
  override def processElement(value: UrlViewCount, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context, out: Collector[String]): Unit = {
    urlState.add(value)
    // Fires shortly after the window end; by then all counts of this window
    // are expected to have arrived.
    ctx.timerService().registerEventTimeTimer(value.windowEnd + timerDelayMs)
  }

  /**
   * Ranks the buffered counts and emits the formatted top list.
   *
   * @param timestamp firing time of the timer (window end plus `timerDelayMs`)
   * @param ctx       timer context (unused)
   * @param out       collector receiving the formatted ranking
   */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    import scala.collection.JavaConverters._

    // Snapshot the buffered counts, then clear the state so that nothing is counted twice.
    val allUrlViewCounts: List[UrlViewCount] = urlState.get().asScala.toList
    urlState.clear()

    // Highest count first; keep at most `size` entries.
    val sortedUrlViewCounts = allUrlViewCounts.sortWith(_.count > _.count).take(size)

    val result: StringBuilder = new StringBuilder
    result.append("====================================\n")
    // Subtract the timer delay again to show the window end itself.
    result.append("时间: ").append(new Timestamp(timestamp - timerDelayMs)).append("\n")
    sortedUrlViewCounts.zipWithIndex.foreach { case (currentUrlView, index) =>
      // e.g. No1: URL=/blog/tags/firefox?flav=rss20 流量=55
      result.append(s"No${index + 1}: URL=${currentUrlView.url} 流量=${currentUrlView.count}\n")
    }
    result.append("====================================\n\n")

    // Deliberately no Thread.sleep here: blocking inside onTimer stalls the
    // operator's task thread and delays all further processing.
    out.collect(result.toString())
  }
}