一、pom.xml增加依赖
<properties>
<!-- Read/write source files as UTF-8 regardless of the platform default charset. -->
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<!-- Single source of truth for the Flink version; all Flink artifacts below should reference it. -->
<flink.version>1.8.0</flink.version>
<java.version>1.8</java.version>
<!-- Scala binary version suffix required by Scala-dependent Flink artifacts (e.g. flink-streaming-java_2.11). -->
<scala.binary.version>2.11</scala.binary.version>
<maven.compiler.source>${java.version}</maven.compiler.source>
<maven.compiler.target>${java.version}</maven.compiler.target>
</properties>
<dependencies>
<!-- Apache Flink dependencies -->
<!-- These dependencies are provided, because they should not be packaged into the JAR file:
     the Flink runtime already ships them on the cluster classpath. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<!-- NOTE: declared exactly once with provided scope. The original pom re-declared
     flink-streaming-java_2.11 a second time at compile scope with a hard-coded
     version, which overrode this entry and packaged the runtime into the fat JAR. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
<scope>provided</scope>
</dependency>
<!-- Add connector dependencies here. They must be in the default scope (compile). -->
<!-- Example:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.10_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
-->
<!-- Wikipedia edit-stream connector used by WikipediaAnalysis; compile scope so it
     is packaged into the application JAR. Uses the scala.binary.version property
     instead of a hard-coded _2.11 suffix. (flink-core is not declared explicitly:
     it is a transitive dependency of flink-java.) -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-wikiedits_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Add logging framework, to produce console output when running in the IDE. -->
<!-- These dependencies are excluded from the application JAR by default. -->
<dependency>
<groupId>org.slf4j</groupId>
<artifactId>slf4j-log4j12</artifactId>
<version>1.7.7</version>
<scope>runtime</scope>
</dependency>
<dependency>
<groupId>log4j</groupId>
<artifactId>log4j</artifactId>
<version>1.2.17</version>
<scope>runtime</scope>
</dependency>
</dependencies>
<!-- Java Compiler -->
<!-- FIX: a <plugin> element is not valid directly under <project>; it must be
     nested inside <build><plugins> for Maven to apply it. -->
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.1</version>
<configuration>
<source>${java.version}</source>
<target>${java.version}</target>
</configuration>
</plugin>
</plugins>
</build>
二、编码
import org.apache.flink.api.common.functions.FoldFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.wikiedits.WikipediaEditEvent;
import org.apache.flink.streaming.connectors.wikiedits.WikipediaEditsSource;
/**
 * Streaming job that monitors Wikipedia edits in real time: it keys the edit
 * event stream by user name, sums the byte-diff of each user's edits within a
 * 5-second window, and prints the resulting (user, totalByteDiff) pairs.
 */
public class WikipediaAnalysis {
    public static void main(String[] args) throws Exception {
        // Execution context for the streaming program.
        StreamExecutionEnvironment see = StreamExecutionEnvironment.getExecutionEnvironment();

        // Source: flink-connector-wikiedits ships a ready-made WikipediaEditsSource;
        // each element is one WikipediaEditEvent describing a single edit.
        DataStream<WikipediaEditEvent> edits = see.addSource(new WikipediaEditsSource());

        // Key the stream by user so the window below aggregates per user.
        // Method reference replaces the original lambda-with-explicit-return;
        // the KeySelector cast is still needed to fix the target type.
        KeyedStream<WikipediaEditEvent, String> keyedEdits = edits
                .keyBy((KeySelector<WikipediaEditEvent, String>) WikipediaEditEvent::getUser);

        // 5-second tumbling processing-time window, folded into a running
        // (user, totalByteDiff) tuple per key.
        // NOTE(review): fold() is deprecated in Flink 1.8 in favor of
        // aggregate(AggregateFunction); kept here to preserve the original behavior.
        DataStream<Tuple2<String, Long>> result = keyedEdits
                .timeWindow(Time.seconds(5))
                .fold(new Tuple2<>("", 0L), new FoldFunction<WikipediaEditEvent, Tuple2<String, Long>>() {
                    @Override
                    public Tuple2<String, Long> fold(Tuple2<String, Long> acc, WikipediaEditEvent event) {
                        // Every event in this keyed window belongs to the same user.
                        acc.f0 = event.getUser();
                        // int byteDiff widens to long implicitly; the original
                        // Long.valueOf(...) boxing was unnecessary.
                        acc.f1 += event.getByteDiff();
                        return acc;
                    }
                });

        result.print();
        // Name the job so it is identifiable in the Flink dashboard.
        see.execute("Wikipedia user edit-volume analysis");
    }
}