flinkScala

最新推荐文章于 2024-08-05 11:29:42 发布

给我一个苹果

最新推荐文章于 2024-08-05 11:29:42 发布

阅读量2.8k

点赞数

文章标签： apache scala java

本文链接：https://blog.csdn.net/weixin_43705952/article/details/121801028

版权

本文档展示了如何使用Apache Flink和Scala进行网络流量分析。通过实例代码，实现了Apache日志事件的处理，包括数据转换、时间戳处理、窗口聚合以及自定义的预聚合函数和窗口处理函数，旨在统计URL的访问次数和实现唯一访客计数。

摘要由CSDN通过智能技术生成

package com. gu.networkflow_analysis

import java.sql.Timestamp
import java.text.SimpleDateFormat

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor, MapState, MapStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer

/**

Copyright © 2018-2028 All Rights Reserved
Project: UserBehaviorAnalysis
Package: com. gu.networkflow_analysis
Version: 1.0
Created by wushengran on 2019/9/23 9:21
*/

/**
 * Input record: one parsed line of the Apache access log.
 *
 * @param ip        IP field of the log line
 * @param userId    user id field of the log line
 * @param eventTime event time in epoch milliseconds
 * @param method    request method field
 * @param url       requested URL field
 */
case class ApacheLogEvent(
  ip: String,
  userId: String,
  eventTime: Long,
  method: String,
  url: String
)

/**
 * Window aggregation result: the hit count of one URL in one window.
 *
 * @param url       the URL that was counted
 * @param windowEnd end timestamp of the window the count belongs to
 * @param count     number of hits for `url` in that window
 */
case class UrlViewCount(
  url: String,
  windowEnd: Long,
  count: Long
)

/**
 * Streaming job that counts hits per URL in sliding event-time windows and then,
 * for every window, prints the most visited URLs.
 */
object NetworkFlow {

  /**
   * Builds and runs the pipeline: socket source -> parse -> watermarks ->
   * per-URL sliding window count -> per-window top-N ranking.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // val dataStream = env.readTextFile("D:\\Projects\\BigData\\UserBehaviorAnalysis\\NetworkFlowAnalysis\\src\\main\\resources\\apache.log")
    val dataStream = env.socketTextStream("localhost", 7777)
      .map( data => {
        val dataArray = data.split(" ")
        // The formatter is created per record on purpose: SimpleDateFormat is not thread-safe.
        val simpleDateFormat = new SimpleDateFormat("dd/MM/yyyy:HH:mm:ss")
        val timestamp = simpleDateFormat.parse(dataArray(3).trim).getTime
        ApacheLogEvent( dataArray(0).trim, dataArray(1).trim, timestamp, dataArray(5).trim, dataArray(6).trim )
      } )
      // NOTE(review): the out-of-orderness bound is an assumed value (1 second);
      // tune it to the actual disorder of the source.
      .assignTimestampsAndWatermarks(
        new BoundedOutOfOrdernessTimestampExtractor[ApacheLogEvent](Time.seconds(1)) {
          override def extractTimestamp(element: ApacheLogEvent): Long = element.eventTime
        }
      )
      .keyBy(_.url)
      // 10-minute windows sliding every 5 seconds; late events are accepted for 60 more seconds.
      .timeWindow(Time.minutes(10), Time.seconds(5))
      .allowedLateness(Time.seconds(60))
      .aggregate( new CountAgg(), new WindowResult() )

    // Regroup the per-URL counts by window so each window can be ranked on its own.
    val processedStream = dataStream
      .keyBy(_.windowEnd)
      .process( new TopNHotUrls(5) )

    dataStream.print("aggregate")
    processedStream.print("process")

    env.execute("network flow job")
  }
}

/**
 * Incremental pre-aggregation function: keeps a running count of the events
 * added to it, one per element.
 */
class CountAgg() extends AggregateFunction[ApacheLogEvent, Long, Long] {

  /** A fresh accumulator starts at zero. */
  override def createAccumulator(): Long = 0L

  /** Each event bumps the count by one; the event's content is not inspected. */
  override def add(value: ApacheLogEvent, accumulator: Long): Long = {
    val next = accumulator + 1
    next
  }

  /** Two partial counts combine by addition. */
  override def merge(a: Long, b: Long): Long = {
    val combined = a + b
    combined
  }

  /** The emitted result is the accumulated count itself. */
  override def getResult(accumulator: Long): Long = accumulator
}

/**
 * Window function that wraps the pre-aggregated count of a window into a
 * [[UrlViewCount]], tagging it with the key (URL) and the window end.
 */
class WindowResult() extends WindowFunction[Long, UrlViewCount, String, TimeWindow] {

  /**
   * @param key    the URL this window is keyed by
   * @param window the window being emitted
   * @param input  the aggregated values for the window; only the first one is read
   * @param out    collector receiving the single result record
   */
  override def apply(key: String, window: TimeWindow, input: Iterable[Long], out: Collector[UrlViewCount]): Unit = {
    val hits = input.head
    val windowEnd = window.getEnd
    out.collect(UrlViewCount(key, windowEnd, hits))
  }
}

/**
 * Ranking function: collects the per-URL counts that share one window end and,
 * when the timer for that window fires, emits a formatted report of the
 * `topSize` most visited URLs.
 *
 * @param topSize maximum number of URLs listed in each report
 */
class TopNHotUrls(topSize: Int) extends KeyedProcessFunction[Long, UrlViewCount, String] {
  // url -> latest count seen for the current key; a later record for the same URL overwrites the earlier one.
  // NOTE(review): this state is never cleared (the clear() call in onTimer is commented out),
  // so entries accumulate per key — confirm whether that is intended to support late updates.
  lazy val urlState: MapState[String, Long] = getRuntimeContext.getMapState(
    new MapStateDescriptor[String, Long]("url-state", classOf[String], classOf[Long])
  )

  /** Stores the incoming count and registers an event-time timer 1 ms after the window end. */
  override def processElement(value: UrlViewCount, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#Context, out: Collector[String]): Unit = {
    urlState.put(value.url, value.count)
    ctx.timerService().registerEventTimeTimer(value.windowEnd + 1)
  }

  /**
   * Reads every (url, count) pair from state, sorts by count descending,
   * keeps the first `topSize` and emits one multi-line report string.
   *
   * @param timestamp the timer timestamp, i.e. window end + 1
   */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, UrlViewCount, String]#OnTimerContext, out: Collector[String]): Unit = {
    // Copy the state contents into a local buffer.
    val allUrlViews = new ListBuffer[(String, Long)]()
    val iter = urlState.entries().iterator()
    while (iter.hasNext) {
      val entry = iter.next()
      allUrlViews += (( entry.getKey, entry.getValue ))
    }

    // urlState.clear()

    val sortedUrlViews = allUrlViews.sortWith(_._2 > _._2).take(topSize)

    // Format the report; timestamp - 1 undoes the +1 used when the timer was registered.
    val result: StringBuilder = new StringBuilder()
    result.append("时间：").append( new Timestamp( timestamp - 1 ) ).append("\n")
    for ( (currentUrlView, i) <- sortedUrlViews.zipWithIndex ) {
      result.append("NO").append(i + 1).append(":")
        .append(" URL=").append(currentUrlView._1)
        .append(" 访问量=").append(currentUrlView._2).append("\n")
    }
    result.append("=============================")
    out.collect(result.toString())
  }
}
package com. gu.networkflow_analysis

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time

/**

Copyright © 2018-2028 All Rights Reserved
Project: UserBehaviorAnalysis
Package: com. gu.networkflow_analysis
Version: 1.0
Created by wushengran on 2019/9/23 10:28
*/

/**
 * Input record: one user-behavior event read from the CSV source.
 *
 * @param userId     id of the acting user
 * @param itemId     id of the item acted upon
 * @param categoryId id of the item's category
 * @param behavior   behavior type; the jobs in this file filter on "pv"
 * @param timestamp  event time in seconds (the jobs multiply it by 1000 to get milliseconds)
 */
case class UserBehavior(
  userId: Long,
  itemId: Long,
  categoryId: Int,
  behavior: String,
  timestamp: Long
)

/** Streaming job that counts "pv" events per one-hour tumbling event-time window. */
object PageView {

  /**
   * Reads UserBehavior.csv from the classpath, keeps only "pv" events and
   * prints the per-hour totals.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data file relative to the classpath.
    val inputPath = getClass.getResource("/UserBehavior.csv").getPath

    val behaviors = env.readTextFile(inputPath)
      .map { line =>
        val fields = line.split(",")
        UserBehavior(
          fields(0).trim.toLong,
          fields(1).trim.toLong,
          fields(2).trim.toInt,
          fields(3).trim,
          fields(4).trim.toLong
        )
      }
      // Source timestamps are in seconds; event time is expressed in milliseconds.
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Only page views are counted; every event maps to the same key so one total per window results.
    val pvCounts = behaviors
      .filter(_.behavior == "pv")
      .map(_ => ("pv", 1))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .sum(1)

    pvCounts.print("pv count")

    env.execute("page view jpb")
  }
}
package com. gu.networkflow_analysis

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**

Copyright © 2018-2028 All Rights Reserved
Project: UserBehaviorAnalysis
Package: com. gu.networkflow_analysis
Version: 1.0
Created by wushengran on 2019/9/23 10:43
*/
/**
 * Unique-visitor count of one window.
 *
 * @param windowEnd end timestamp of the window
 * @param uvCount   number of distinct visitors counted for that window
 */
case class UvCount(
  windowEnd: Long,
  uvCount: Long
)

/** Streaming job that counts distinct users among "pv" events per one-hour window. */
object UniqueVisitor {

  /**
   * Reads UserBehavior.csv from the classpath, keeps only "pv" events and
   * prints one [[UvCount]] per one-hour all-window.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data file relative to the classpath.
    val inputPath = getClass.getResource("/UserBehavior.csv").getPath

    val behaviors = env.readTextFile(inputPath)
      .map { line =>
        val fields = line.split(",")
        UserBehavior(
          fields(0).trim.toLong,
          fields(1).trim.toLong,
          fields(2).trim.toInt,
          fields(3).trim,
          fields(4).trim.toLong
        )
      }
      // Source timestamps are in seconds; event time is expressed in milliseconds.
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Only page views are counted; the window is not keyed, so all events land in one window per hour.
    val uvCounts = behaviors
      .filter(_.behavior == "pv")
      .timeWindowAll(Time.hours(1))
      .apply(new UvCountByWindow())

    uvCounts.print()
    env.execute("uv job")
  }
}

/**
 * All-window function that emits the number of distinct user ids seen in a window.
 * The whole window is deduplicated in memory through a Scala set.
 */
class UvCountByWindow() extends AllWindowFunction[UserBehavior, UvCount, TimeWindow] {

  /**
   * @param window the window being evaluated
   * @param input  every event assigned to the window
   * @param out    collector receiving a single [[UvCount]] for the window
   */
  override def apply(window: TimeWindow, input: Iterable[UserBehavior], out: Collector[UvCount]): Unit = {
    // Collecting the ids into an immutable set removes duplicates; its size is the UV count.
    val distinctUserIds: Set[Long] = input.iterator.map(_.userId).toSet
    out.collect( UvCount( window.getEnd, distinctUserIds.size.toLong ) )
  }
}
package com. gu.networkflow_analysis

import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.ProcessWindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.triggers.{Trigger, TriggerResult}
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector
import redis.clients.jedis.Jedis

/**

Copyright © 2018-2028 All Rights Reserved
Project: UserBehaviorAnalysis
Package: com. gu.networkflow_analysis
Version: 1.0
Created by wushengran on 2019/9/23 11:34
*/
/**
 * Streaming job that counts distinct users among "pv" events per one-hour window
 * without buffering the window: every element triggers the window function, which
 * deduplicates through a bitmap kept in Redis.
 */
object UvWithBloom {

  /**
   * Reads UserBehavior.csv from the classpath and prints the running
   * [[UvCount]] of each window.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Locate the data file relative to the classpath.
    val resource = getClass.getResource("/UserBehavior.csv")
    val dataStream = env.readTextFile(resource.getPath)
      .map(data => {
        val dataArray = data.split(",")
        UserBehavior(dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim.toInt, dataArray(3).trim, dataArray(4).trim.toLong)
      })
      // Source timestamps are in seconds; event time is expressed in milliseconds.
      .assignAscendingTimestamps(_.timestamp * 1000L)
      .filter(_.behavior == "pv") // only count pv events
      // A constant key routes every event to the same keyed window.
      .map(data => ("dummyKey", data.userId))
      .keyBy(_._1)
      .timeWindow(Time.hours(1))
      .trigger(new MyTrigger())
      .process(new UvCountWithBloom())

    dataStream.print()

    env.execute("uv with bloom job")
  }
}

/**
 * Custom window trigger: fires and purges the window on every incoming element,
 * and does nothing on event-time or processing-time timers.
 */
class MyTrigger() extends Trigger[(String, Long), TimeWindow] {

  /** Every element immediately triggers the window computation and clears the window contents. */
  override def onElement(element: (String, Long), timestamp: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    TriggerResult.FIRE_AND_PURGE

  /** Event-time timers never fire the window. */
  override def onEventTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    TriggerResult.CONTINUE

  /** Processing-time timers never fire the window. */
  override def onProcessingTime(time: Long, window: TimeWindow, ctx: Trigger.TriggerContext): TriggerResult =
    TriggerResult.CONTINUE

  /** The trigger holds no state of its own, so there is nothing to clean up. */
  override def clear(window: TimeWindow, ctx: Trigger.TriggerContext): Unit = ()
}

/**
 * Hashing helper for a bitmap-backed Bloom filter.
 *
 * @param size total number of bits in the bitmap; a non-positive value falls back
 *             to 1 << 27 bits (16 MB)
 */
class Bloom(size: Long) extends Serializable {
  // Bitmap capacity in bits.
  private val cap: Long =
    if (size <= 0) 1L << 27
    else size

  /**
   * Maps a string to a bit offset.
   *
   * Each character is folded into a polynomial hash with `seed` as the multiplier,
   * and the result is masked with `cap - 1`.
   *
   * @param value the string to hash
   * @param seed  multiplier applied at every step
   * @return the hash masked with `cap - 1`
   */
  def hash(value: String, seed: Int): Long = {
    val polynomial = value.foldLeft(0L) { (acc, ch) => acc * seed + ch }
    val mask = cap - 1
    polynomial & mask
  }
}

/**
 * Window function that maintains a per-window unique-visitor count in Redis.
 *
 * For each window a Redis bitmap (key = window end) records which hash offsets
 * have been seen, and the Redis hash "count" maps window end -> current UV count.
 * Because the hash can collide, distinct users may share an offset, in which case
 * the later one is not counted.
 */
class UvCountWithBloom() extends ProcessWindowFunction[(String, Long), UvCount, String, TimeWindow] {
  // Redis connection, created lazily on first use.
  lazy val jedis = new Jedis("localhost", 6379)
  lazy val bloom = new Bloom(1 << 29)

  /**
   * Checks the last element of `elements` against the window's bitmap, updates
   * Redis when the user is new, and emits the window's current count.
   *
   * @param key      the window key
   * @param context  gives access to the current window
   * @param elements the elements handed to this invocation; only the last one is read
   * @param out      collector receiving the current [[UvCount]] of the window
   */
  override def process(key: String, context: Context, elements: Iterable[(String, Long)], out: Collector[UvCount]): Unit = {
    // The bitmap is stored under the window end; the same string is the field name in the "count" hash.
    val windowEnd = context.window.getEnd
    val storeKey = windowEnd.toString

    // hget returns null when no count has been stored for this window yet; read it once.
    val count = Option(jedis.hget("count", storeKey)).map(_.toLong).getOrElse(0L)

    // Hash the user id to a bit offset and test that bit in the window's bitmap.
    val userId = elements.last._2.toString
    val offset = bloom.hash(userId, 61)
    val isExist: Boolean = jedis.getbit(storeKey, offset)

    val currentCount =
      if (isExist) {
        count
      } else {
        // First sighting of this offset: mark the bit and persist the incremented count.
        jedis.setbit(storeKey, offset, true)
        jedis.hset("count", storeKey, (count + 1).toString)
        count + 1
      }

    out.collect( UvCount(windowEnd, currentCount) )
  }
}

package com. gu.marketanalysis

import java.sql.Timestamp

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**

Project: UserBehaviorAnalysis
Package: com. gu.marketanalysis
Version: 1.0
Created by wushengran on 2019/9/24 10:10
*/
/**
 * Input record: one ad click event.
 *
 * @param userId    id of the clicking user
 * @param adId      id of the clicked ad
 * @param province  province field of the record
 * @param city      city field of the record
 * @param timestamp event time in seconds (the job multiplies it by 1000 to get milliseconds)
 */
case class AdClickEvent(
  userId: Long,
  adId: Long,
  province: String,
  city: String,
  timestamp: Long
)
/**
 * Output record: the click count of one province in one window.
 *
 * @param windowEnd end of the window, as a string
 * @param province  the province the count belongs to
 * @param count     number of clicks counted
 */
case class CountByProvince(
  windowEnd: String,
  province: String,
  count: Long
)
/**
 * Blacklist warning emitted for a (user, ad) pair.
 *
 * @param userId id of the offending user
 * @param adId   id of the ad that was clicked
 * @param msg    human-readable warning text
 */
case class BlackListWarning(
  userId: Long,
  adId: Long,
  msg: String
)

/**
 * Streaming job that counts ad clicks per province in sliding event-time windows,
 * after dropping clicks from (user, ad) pairs that exceed a click limit. Pairs that
 * hit the limit are reported once on a side output.
 */
object AdStatisticsByGeo {
  // Tag of the side output that carries blacklist warnings.
  // NOTE(review): the tag id "blacklist" is an assumed value — confirm.
  val blackListOutputTag: OutputTag[BlackListWarning] = new OutputTag[BlackListWarning]("blacklist")

  /**
   * Reads AdClickLog.csv from the classpath, filters click flooding, and prints
   * both the per-province counts and the blacklist warnings.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
    env.setParallelism(1)

    // Read the data and convert each line into an AdClickEvent.
    val resource = getClass.getResource("/AdClickLog.csv")
    val adEventStream = env.readTextFile(resource.getPath)
      .map( data => {
        val dataArray = data.split(",")
        AdClickEvent( dataArray(0).trim.toLong, dataArray(1).trim.toLong, dataArray(2).trim, dataArray(3).trim, dataArray(4).trim.toLong )
      } )
      // Source timestamps are in seconds; event time is expressed in milliseconds.
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Custom process function that filters out heavy click flooding per (user, ad).
    val filterBlackListStream = adEventStream
      .keyBy( data => (data.userId, data.adId) )
      .process( new FilterBlackListUser(100) )

    // Group by province and aggregate in 1-hour windows sliding every 5 seconds.
    val adCountStream = filterBlackListStream
      .keyBy(_.province)
      .timeWindow( Time.hours(1), Time.seconds(5) )
      .aggregate( new AdCountAgg(), new AdCountResult() )

    adCountStream.print("count")
    filterBlackListStream.getSideOutput(blackListOutputTag).print("blacklist")

    env.execute("ad statistics job")
  }

  /**
   * Forwards click events until the current key has produced `maxCount` of them;
   * from then on events are dropped and a single [[BlackListWarning]] is sent to
   * the side output. A processing-time timer resets all state for the key.
   *
   * @param maxCount number of clicks forwarded per key before it is blacklisted
   */
  class FilterBlackListUser(maxCount: Int) extends KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent] {
    // Number of clicks forwarded so far for the current (user, ad) key.
    lazy val countState: ValueState[Long] = getRuntimeContext.getState( new ValueStateDescriptor[Long]("count-state", classOf[Long]) )
    // Whether the blacklist warning has already been sent for the current key.
    lazy val isSentBlackList: ValueState[Boolean] = getRuntimeContext.getState( new ValueStateDescriptor[Boolean]("issent-state", classOf[Boolean]) )
    // Timestamp of the registered reset timer.
    lazy val resetTimer: ValueState[Long] = getRuntimeContext.getState( new ValueStateDescriptor[Long]("resettime-state", classOf[Long]) )

    /**
     * Counts the click; forwards it while the count is below `maxCount`, otherwise
     * drops it and emits the warning once.
     */
    override def processElement(value: AdClickEvent, ctx: KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent]#Context, out: Collector[AdClickEvent]): Unit = {
      val curCount = countState.value()

      // First click for this key: schedule the reset at the next day boundary.
      // The boundary is computed on epoch milliseconds, so it falls on 00:00 UTC.
      if ( curCount == 0 ) {
        val millisPerDay = 1000 * 60 * 60 * 24
        val ts = ( ctx.timerService().currentProcessingTime() / millisPerDay + 1 ) * millisPerDay
        resetTimer.update(ts)
        ctx.timerService().registerProcessingTimeTimer(ts)
      }

      if ( curCount >= maxCount ) {
        // Limit reached: drop the event and send the warning only once.
        if ( !isSentBlackList.value() ) {
          isSentBlackList.update(true)
          ctx.output( blackListOutputTag, BlackListWarning(value.userId, value.adId, s"Click over $maxCount times today.") )
        }
      } else {
        // Below the limit: increment the count and forward the event to the main output.
        countState.update( curCount + 1 )
        out.collect( value )
      }
    }

    /** Clears all state for the key when the fired timer is the stored reset timer. */
    override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[(Long, Long), AdClickEvent, AdClickEvent]#OnTimerContext, out: Collector[AdClickEvent]): Unit = {
      if ( timestamp == resetTimer.value() ) {
        isSentBlackList.clear()
        countState.clear()
        resetTimer.clear()
      }
    }
  }
}

// 自定义预聚合函数