Flink 示例 - Wikipedia 词条编辑实时流计算(使用 Scala 实现)
Flink官方的一个示例(https://ci.apache.org/projects/flink/flink-docs-release-1.8/tutorials/datastream_api.html)
作用:监控 Wikipedia 的实时编辑事件,按时间窗口统计每个用户编辑的字节变化量
官方示例使用 Java 实现,本文改用更为简洁的 Scala 编写(代码中仍直接调用 Flink 的 Java DataStream API),仅供参考
项目代码部分
pom.xml
<!-- Single Flink version, referenced as ${flink.version} by every org.apache.flink dependency in this pom. -->
<properties>
<flink.version>1.7.0</flink.version>
</properties>
<dependencies>
  <!-- Flink artifacts built for Scala 2.11; all share the ${flink.version} property. -->
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>${flink.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>${flink.version}</version>
  </dependency>
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.11</artifactId>
    <version>${flink.version}</version>
  </dependency>
  <!-- Provides WikipediaEditsSource / WikipediaEditEvent used by WikipediaEditMonitor. -->
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-wikiedits_2.11</artifactId>
    <version>${flink.version}</version>
  </dependency>
  <!-- Test-only dependency: keep it off the compile/runtime classpath of the job. -->
  <dependency>
    <groupId>junit</groupId>
    <artifactId>junit</artifactId>
    <version>4.12</version>
    <scope>test</scope>
  </dependency>
</dependencies>
WikipediaEditMonitor.scala
package com.mybigdata
import org.apache.flink.api.common.functions.{FoldFunction}
import org.apache.flink.api.java.functions.KeySelector
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.connectors.wikiedits.{WikipediaEditEvent, WikipediaEditsSource}
/**
 * Streaming job that reads Wikipedia edit events, groups them by editing user,
 * sums the byte difference of each user's edits over fixed-size time windows,
 * and prints one `(user, byteDiffSum)` pair per user per window.
 *
 * Command-line arguments (parsed with `ParameterTool.fromArgs`):
 *   - `--size <seconds>`: window length in seconds; 5 is used when the key is absent.
 */
object WikipediaEditMonitor {

  /** Window length, in seconds, used when `--size` is not supplied. */
  private val DefaultWindowSeconds = 5

  /**
   * Builds and runs the job.
   *
   * @param args command-line arguments; see the object-level documentation
   * @throws IllegalArgumentException if `--size` is zero or negative
   */
  def main(args: Array[String]): Unit = {
    // Read the window length (in seconds) from the arguments, falling back to the default.
    val params = ParameterTool.fromArgs(args)
    val windowSeconds =
      if (params.has("size")) params.getInt("size") else DefaultWindowSeconds
    // Reject a non-positive length up front so the failure names the offending argument.
    require(windowSeconds > 0, s"--size must be a positive number of seconds, got $windowSeconds")
    val size = Time.seconds(windowSeconds)
    printf("Window size:%d %s\n", size.getSize, size.getUnit.toString.toLowerCase)

    // Obtain the execution environment.
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // Source stream of Wikipedia edit events.
    val wikiEditsDataStream = env.addSource(new WikipediaEditsSource())

    // Key the stream by the user who made the edit.
    val keyedEdits = wikiEditsDataStream.keyBy(new KeySelector[WikipediaEditEvent, String] {
      override def getKey(event: WikipediaEditEvent): String = event.getUser
    })

    // Per key and window, fold the events into (user, sum of byte diffs).
    // The accumulator starts as ("", 0L); each event overwrites the user name and adds its byte diff.
    // NOTE(review): Flink's documentation marks window `fold` as deprecated in favor of
    // `aggregate` - consider migrating; confirm against the Flink version in use.
    val result = keyedEdits
      .timeWindow(size)
      .fold(("", 0L), new FoldFunction[WikipediaEditEvent, (String, Long)] {
        override def fold(acc: (String, Long), event: WikipediaEditEvent): (String, Long) =
          (event.getUser, acc._2 + event.getByteDiff)
      })

    // Print each window result to standard output.
    result.print()

    // Submit the job for execution.
    env.execute("统计维基百科编辑词条")
  }
}
结果
Window size:2 seconds
2> (ST47,27)
1> (05:204:5486:383F:F847:DCD5:196D:4542,32)
7> (FunkMonk,34)
7> (STATicVapor,289)
5> (Tetsou TheIronman,6)
5> (Longrunsthefox,102)
1> (Kku,4)
7> (Arjayay,-21)
8> (Onel5969,50)
4> (TyMega,14)
4> (9.76.90.79,7)
3> (Llammakey,-4)
3> (PBS,49)
...
...