初识 Flink
Flink 官网
![flink1](https://img-blog.csdnimg.cn/20210217205244449.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMzAxNzA3,size_16,color_FFFFFF,t_70#pic_center)
1. 什么是 Flink
- Apache Flink是一个框架和分布式处理引擎,用于在无界和有界数据流上进行有状态计算。Flink被设计成在所有常见的集群环境中运行,以内存速度和任何规模执行计算。
- Flink 用于处理 Unbounded、Bounded 数据。
- Unbounded 有定义流的开始,但没有定义流的结束。它们会无休止地产生数据。无界流的数据必须持续处理,即数据被摄取后需要立刻处理。我们不能等到所有数据都到达再处理,因为输入是无限的,在任何时候输入都不会完成。处理无界数据通常要求以特定顺序摄取事件,例如事件发生的顺序,以便能够推断结果的完整性。
- Bounded 有定义流的开始,也有定义流的结束。有界流可以在摄取所有数据后再进行计算。有界流所有数据可以被排序,所以并不需要有序摄取。有界流处理通常被称为批处理。
![flink2](https://img-blog.csdnimg.cn/20210217205509100.png?x-oss-process=image/watermark,type_ZmFuZ3poZW5naGVpdGk,shadow_10,text_aHR0cHM6Ly9ibG9nLmNzZG4ubmV0L3FxXzQxMzAxNzA3,size_16,color_FFFFFF,t_70#pic_center)
- Apache Flink 是一个分布式系统,它需要计算资源来执行应用程序。Flink 集成了所有常见的集群资源管理器,例如 Hadoop YARN、 Apache Mesos 和 Kubernetes,但同时也可以作为独立集群运行。
2. Flink Batch、Streaming Demo
2.1 POM 文件增加
<!-- Flink DataStream/DataSet API compiled for Scala 2.12 -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.12</artifactId>
    <version>1.11.2</version>
</dependency>
<!-- Needed to execute/submit jobs (includes the local mini-cluster).
     NOTE: fixed artifactId — the original had a stray '}' after _2.12,
     which would make the dependency unresolvable. -->
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-clients_2.12</artifactId>
    <version>1.11.2</version>
</dependency>
2.2 Batch WC
hadoop,spark,flink
spark,hadoop
spark
package com.xk.bigdata.flink.basic.scala
import org.apache.flink.api.scala.ExecutionEnvironment
import org.apache.flink.api.scala._
object BatchWcApp {

  /** Batch word count: reads `data/wc.txt`, lower-cases and splits each
    * line on commas, then prints the (word, count) pairs grouped by word.
    */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment

    // One token per comma-separated word, normalized to lower case.
    val words = env
      .readTextFile("data/wc.txt")
      .flatMap(line => line.toLowerCase.split(","))

    words
      .map(word => (word, 1)) // seed every word with a count of 1
      .groupBy(0)             // group on the word (tuple field 0)
      .sum(1)                 // sum the counts (tuple field 1)
      .print()
  }
}
package com.xk.bigdata.flink.basic.java;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchWcApp {

    /**
     * Batch word-count job: reads {@code data/wc.txt}, lower-cases and splits
     * every line on commas, then prints (word, count) pairs grouped by word.
     */
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        env.readTextFile("data/wc.txt")
                // emit one token per comma-separated word, lower-cased
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public void flatMap(String line, Collector<String> out) throws Exception {
                        for (String word : line.toLowerCase().split(",")) {
                            out.collect(word);
                        }
                    }
                })
                // seed every word with an initial count of 1
                .map(new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String word) throws Exception {
                        return new Tuple2<>(word, 1);
                    }
                })
                .groupBy(0) // group on the word (tuple field 0)
                .sum(1)     // sum the counts (tuple field 1)
                .print();
    }
}
2.3 Streaming WC
package com.xk.bigdata.flink.basic.scala
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
object StreamingWcApp {

  /** Streaming word count: consumes comma-separated words from a socket
    * source at bigdata:16666 and prints a running count per word.
    *
    * Start the source first, e.g. `nc -lk 16666` on the `bigdata` host.
    */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val counts = env
      .socketTextStream("bigdata", 16666)
      .flatMap(line => line.toLowerCase.split(","))
      .map(word => (word, 1)) // seed every word with a count of 1
      .keyBy(_._1)            // key on the word itself
      .sum(1)                 // running sum of the counts

    counts.print()

    // Streaming jobs are lazy: nothing runs until execute() is called.
    env.execute(this.getClass.getSimpleName)
  }
}
package com.xk.bigdata.flink.basic.java;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
public class StreamingWcApp {

    /**
     * Streaming word-count job: consumes comma-separated words from a socket
     * source at bigdata:16666, keys the stream by word and prints running
     * counts. Start the source first, e.g. {@code nc -lk 16666}.
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.socketTextStream("bigdata", 16666)
                // emit one token per comma-separated word, lower-cased
                .flatMap(new FlatMapFunction<String, String>() {
                    @Override
                    public void flatMap(String s, Collector<String> collector) throws Exception {
                        String[] words = s.toLowerCase().split(",");
                        for (String word : words) {
                            collector.collect(word);
                        }
                    }
                })
                // seed every word with an initial count of 1
                .map(new MapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> map(String s) throws Exception {
                        return new Tuple2<>(s, 1);
                    }
                })
                // keyBy(int) on a field position is deprecated since Flink 1.11;
                // key with a KeySelector on the word field instead — this also
                // matches the Scala version's keyBy(_._1).
                .keyBy(value -> value.f0)
                .sum(1) // running sum of the counts (tuple field 1)
                .print();
        // Streaming jobs are lazy: nothing runs until execute() is called.
        env.execute(StreamingWcApp.class.getSimpleName());
    }
}
[root@bigdata ~]# nc -lk 16666
hadoop,spark,flink
spark
flink
hadoop
1> (spark,1)
7> (flink,1)
8> (hadoop,1)
1> (spark,2)
7> (flink,2)
8> (hadoop,2)
8> (hadoop,1)
1> (spark,1)
7> (flink,1)
1> (spark,2)
7> (flink,2)
8> (hadoop,2)