package com. gu.networkflow_analysis
import java.sql.Timestamp
import java.text.SimpleDateFormat
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import scala.collection.mutable.ListBuffer
/**
- Copyright © 2018-2028 All Rights Reserved
- Project: UserBehaviorAnalysis
- Package: com. gu.networkflow_analysis
- Version: 1.0
- Created by wushengran on 2019/9/23 9:21
*/
// 输入数据样例类
case class ApacheLogEvent( ip: String, userId: String, eventTime: Long, method: String, url: String)
// 窗口聚合结果样例类
case class UrlViewCount( url: String, windowEnd: Long, count: Long )
object NetworkFlow {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setParallelism(1)
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
// val dataStream = env.readTextFile(“D:\Projects\BigData\UserBehaviorAnalysis\NetworkFlowAnalysis\src\main\resources\apache.log”)
val dataStream = env.socketTextStream(“localhost”, 7777)
.map( data => {
val dataArray = data.split(" ")
// 定义时间转换
val simpleDateFormat = new SimpleDateFormat(“dd/MM/yyyy:HH:mm:ss”)
val timestamp = simpleDateFormat.parse(dataArray(3).trim).getTime
ApacheLogEvent( dataArray(0).trim, dataArray(1).trim, timestamp, dataArray(5).trim, dataArray(6).trim )
} )
.assignTimestampsAndWatermarks( new BoundedOutOfOrdernessTimestampExtractorApacheLogEvent {
override def extractTimestamp(element: ApacheLogEvent): Long = element.eventTime
} )
.keyBy(_.url)
.timeWindow(Time.minutes(10), Time.seconds(5))
.allowedLateness(Time.seconds(60))
.aggregate( new CountAgg(), new WindowResult() )
val processedStream = dataStream
.keyBy(_.windowEnd)
.process( new TopNHotUrls(5) )
dataStream.print("aggregate")
processedStream.print("process")
env.execute("network flow job")
}
}
// 自定义预聚合函数
class CountAgg() extends AggregateFunction[ApacheLogEvent, Long, Long]{
override def add(value: ApacheLogEvent, accumulator: Long): Long = accumulator + 1
override def createAccumulator(): Long = 0L
override def getResult(accumulator: Long): Long = accumulator
override def merge(a: Long, b: Long): Long = a + b
}
// 自定义窗口处理函数
class WindowResult() extends WindowFunction[Long, UrlViewCount, String, TimeWindow]{
override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
out.collect( UrlViewCount( key, window.getEnd, input.iterator.next() ) )
}
}
// 自定义排序输出处理函数
class TopNHotUrls(topSize: Int) extends KeyedProcessFunction[Long, UrlViewCount, String]{
lazy val urlState: MapState[String, Long] = getRuntimeContext.getMapState( new MapStateDescriptor[String, Long](“url-state”, classOf[String], classOf[Long] ) )
override def processElement(value: UrlViewCount, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context, out: Collector[String]): Unit = {
urlState.put(value.url, value.count)
ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
}
override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
// 从状态中拿到数据
val allUrlViews: ListBuffer[(String, Long)] = new ListBuffer(String, Long)
val iter = urlState.entries().iterator()
while(iter.hasNext){
val entry = iter.next()
allUrlViews += (( entry.getKey, entry.getValue ))
}
// urlState.clear()
val sortedUrlViews = allUrlViews.sortWith(_._2 > _._2).take(topSize)
// 格式化结果输出
val result: StringBuilder = new StringBuilder()
result.append("时间:").append( new Timestamp( timestamp - 1 ) ).append("\n")
for( i <- sortedUrlViews.indices ){
val currentUrlView = sortedUrlViews(i)
result.append("NO").append(i + 1).append(":")
.append(" URL=").append(currentUrlView._1)
.append(" 访问量=").append(currentUrlView._2).append("\n")
}
result.append("=============================")
Thread.sleep(1000)
out.collect(result.toString())
}
}
package com. gu.networkflow_analysis
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
/**
- Copyright © 2018-2028 All Rights Reserved
- Project: UserBehaviorAnalysis
- Package: com. gu.networkflow_analysis
- Version: 1.0
- Created by wushengran on 2019/9/23 10:28
*/
// 定义输入数据的样例类
case class UserBehavior( userId: Long, itemId: Long, categoryId: Int, behavior: String, timestamp: Long )
object PageView {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)
// 用相对路径定义数据源
val resource = getClass.getResource("/UserBehavior.csv")
val dataStream = env.readTextFile(resource.getPath)
.map( data => {
val dataArray = data.split(",")
UserBehavior( dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong )
} )
.assignAscendingTimestamps(_.timestamp * 1000L)
.filter( _.behavior == "pv" ) // 只统计pv操作
.map( data => ("pv", 1) )
.keyBy(_._1)
.timeWindow(Time.hours(1))
.sum(1)
dataStream.print("pv count")
env.execute("page view jpb")
}
}
package com. gu.networkflow_analysis
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
/**
- Copyright © 2018-2028 All Rights Reserved
- Project: UserBehaviorAnalysis
- Package: com. gu.networkflow_analysis
- Version: 1.0
- Created by wushengran on 2019/9/23 10:43
*/
case class UvCount( windowEnd: Long, uvCount: Long )
object UniqueVisitor {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)
// 用相对路径定义数据源
val resource = getClass.getResource("/UserBehavior.csv")
val dataStream = env.readTextFile(resource.getPath)
.map( data => {
val dataArray = data.split(",")
UserBehavior( dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong )
} )
.assignAscendingTimestamps(_.timestamp * 1000L)
.filter( _.behavior == "pv" ) // 只统计pv操作
.timeWindowAll( Time.hours(1) )
.apply( new UvCountByWindow() )
dataStream.print()
env.execute("uv job")
}
}
class UvCountByWindow() extends AllWindowFunction[UserBehavior, UvCount, TimeWindow]{
override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UvCount]): Unit = {
// 定义一个scala set,用于保存所有的数据userId并去重
var idSet = SetLong
// 把当前窗口所有数据的ID收集到set中,最后输出set的大小
for( userBehavior <- input ){
idSet += userBehavior.userId
}
out.collect( UvCount( window.getEnd, idSet.size ) )
}
}
package com. gu.networkflow_analysis
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis
/**
-
Copyright © 2018-2028 All Rights Reserved
-
Project: UserBehaviorAnalysis
-
Package: com. gu.networkflow_analysis
-
Version: 1.0
-
Created by wushengran on 2019/9/23 11:34
*/
object UvWithBloom {
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)// 用相对路径定义数据源
val resource = getClass.getResource("/UserBehavior.csv")
val dataStream = env.readTextFile(resource.getPath)
.map(data => {
val dataArray = data.split(",")
UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
})
.assignAscendingTimestamps(.timestamp * 1000L)
.filter(.behavior == “pv”) // 只统计pv操作
.map(data => (“dummyKey”, data.userId))
.keyBy(_._1)
.timeWindow(Time.hours(1))
.trigger(new MyTrigger())
.process(new UvCountWithBloom())dataStream.print()
env.execute(“uv with bloom job”)
}
}
// 自定义窗口触发器
class MyTrigger() extends Trigger[(String, Long), TimeWindow] {
override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = TriggerResult.CONTINUE
override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = {}
override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult = {
// 每来一条数据,就直接触发窗口操作,并清空所有窗口状态
TriggerResult.FIRE_AND_PURGE
}
}
// 定义一个布隆过滤器
class Bloom(size: Long) extends Serializable {
// 位图的总大小,默认16M
private val cap = if (size > 0) size else 1 << 27
// 定义hash函数
def hash(value: String, seed: Int): Long = {
var result = 0L
for( i <- 0 until value.length ){
result = result * seed + value.charAt(i)
}
result & ( cap - 1 )
}
}
class UvCountWithBloom() extends ProcessWindowFunction[(String, Long), UvCount, String, TimeWindow]{
// 定义redis连接
lazy val jedis = new Jedis(“localhost”, 6379)
lazy val bloom = new Bloom(1<<29)
override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UvCount]): Unit = {
// 位图的存储方式,key是windowEnd,value是bitmap
val storeKey = context.window.getEnd.toString
var count = 0L
// 把每个窗口的uv count值也存入名为count的redis表,存放内容为(windowEnd -> uvCount),所以要先从redis中读取
if( jedis.hget(“count”, storeKey) != null ){
count = jedis.hget(“count”, storeKey).toLong
}
// 用布隆过滤器判断当前用户是否已经存在
val userId = elements.last._2.toString
val offset = bloom.hash(userId, 61)
// 定义一个标识位,判断reids位图中有没有这一位
val isExist = jedis.getbit(storeKey, offset)
if(!isExist){
// 如果不存在,位图对应位置1,count + 1
jedis.setbit(storeKey, offset, true)
jedis.hset(“count”, storeKey, (count + 1).toString)
out.collect( UvCount(storeKey.toLong, count + 1) )
} else {
out.collect( UvCount(storeKey.toLong, count) )
}
}
}
package com. gu.marketanalysis
import java.sql.Timestamp
import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
/**
- Copyright © 2018-2028 All Rights Reserved
- Project: UserBehaviorAnalysis
- Package: com. gu.marketanalysis
- Version: 1.0
- Created by wushengran on 2019/9/24 10:10
*/
// 输入的广告点击事件样例类
case class AdClickEvent( userId: Long, adId: Long, province: String, city: String, timestamp: Long )
// 按照省份统计的输出结果样例类
case class CountByProvince( windowEnd: String, province: String, count: Long )
// 输出的黑名单报警信息
case class BlackListWarning( userId: Long, adId: Long, msg: String )
object AdStatisticsByGeo {
// 定义侧输出流的tag
val blackListOutputTag: OutputTag[BlackListWarning] = new OutputTagBlackListWarning
def main(args: Array[String]): Unit = {
val env = StreamExecutionEnvironment.getExecutionEnvironment
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
env.setParallelism(1)
// 读取数据并转换成AdClickEvent
val resource = getClass.getResource("/AdClickLog.csv")
val adEventStream = env.readTextFile(resource.getPath)
.map( data => {
val dataArray = data.split(",")
AdClickEvent( dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim, dataArray(3).trim, dataArray(4).trim.toLong )
} )
.assignAscendingTimestamps(_.timestamp * 1000L)
// 自定义process function,过滤大量刷点击的行为
val filterBlackListStream = adEventStream
.keyBy( data => (data.userId, data.adId) )
.process( new FilterBlackListUser(100) )
// 根据省份做分组,开窗聚合
val adCountStream = filterBlackListStream
.keyBy(_.province)
.timeWindow( Time.hours(1), Time.seconds(5) )
.aggregate( new AdCountAgg(), new AdCountResult() )
adCountStream.print("count")
filterBlackListStream.getSideOutput(blackListOutputTag).print("blacklist")
env.execute("ad statistics job")
}
class FilterBlackListUser(maxCount: Int) extends KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent]{
// 定义状态,保存当前用户对当前广告的点击量
lazy val countState: ValueState[Long] = getRuntimeContext.getState(new ValueStateDescriptor[Long](“count-state”, classOf[Long]))
// 保存是否发送过黑名单的状态
lazy val isSentBlackList: ValueState[Boolean] = getRuntimeContext.getState( new ValueStateDescriptor[Boolean](“issent-state”, classOf[Boolean]) )
// 保存定时器触发的时间戳
lazy val resetTimer: ValueState[Long] = getRuntimeContext.getState( new ValueStateDescriptor[Long](“resettime-state”, classOf[Long]) )
override def processElement(value: AdClickEvent, ctx: KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent]#Context, out: Collector[AdClickEvent]): Unit = {
// 取出count状态
val curCount = countState.value()
// 如果是第一次处理,注册定时器,每天00:00触发
if( curCount == 0 ){
val ts = ( ctx.timerService().currentProcessingTime()/(1000*60*60*24) + 1) * (1000*60*60*24)
resetTimer.update(ts)
ctx.timerService().registerProcessingTimeTimer(ts)
}
// 判断计数是否达到上限,如果到达则加入黑名单
if( curCount >= maxCount ){
// 判断是否发送过黑名单,只发送一次
if( !isSentBlackList.value() ){
isSentBlackList.update(true)
// 输出到侧输出流
ctx.output( blackListOutputTag, BlackListWarning(value.userId, value.adId, "Click over " + maxCount + " times today.") )
}
return
}
// 计数状态加1,输出数据到主流
countState.update( curCount + 1 )
out.collect( value )
}
override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent]#OnTimerContext, out: Collector[AdClickEvent]): Unit = {
// 定时器触发时,清空状态
if( timestamp == resetTimer.value() ){
isSentBlackList.clear()
countState.clear()
resetTimer.clear()
}
}
}
}
// 自定义预聚合函数