编程模型
Flink提供了不同级别的编程抽象。通过在抽象数据集上调用算子构建DataFlow,就可以实现对分布式数据的流式计算和离线计算。DataSet是批处理的抽象数据集,DataStream是流式计算的抽象数据集;它们的程序结构都分为Source、Transformation、Sink三部分。
- Source主要负责数据的读取
- Transformation主要负责对数据的转换操作
- Sink负责将最终计算好的结果数据输出。
DataStream实时wordcount
package com.wedoctor.flink
import org.apache.flink.streaming.api.scala._
object WordCountDemo {
  /** Streaming word count: read text lines from a socket, split them into
    * words and print a running count per word. */
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Source: one record per line received on the socket (e.g. from `nc -lk 9999`).
    val socketLines: DataStream[String] = env.socketTextStream("192.168.xx.xx", 9999)
    // Transformation: words -> (word, 1) -> keyed by word -> running sum of field 1.
    val counts: DataStream[(String, Int)] =
      socketLines
        .flatMap(_.split(" "))
        .map((_, 1))
        .keyBy(_._1)
        .sum(1)
    // Sink: print results to stdout.
    counts.print()
    // Execution is lazy: nothing runs until execute() is called.
    env.execute("Flink WordCount")
  }
}
DataSet 离线wordcount
package com.wedoctor.flink
import org.apache.flink.api.scala._
object WordCountDemo2 {
  /** Batch (DataSet) word count, matching the "离线 wordcount" heading.
    * The original snippet only doubled three integers and did not count
    * any words, so it did not demonstrate what the section advertises. */
  def main(args: Array[String]): Unit = {
    val env = ExecutionEnvironment.getExecutionEnvironment
    // Source: a small in-memory data set of text lines.
    val lines: DataSet[String] = env.fromElements("hello flink", "hello world")
    // Transformation: split into words, pair with 1, group by word, sum counts.
    val counts: DataSet[(String, Int)] = lines
      .flatMap(_.split(" "))
      .map((_, 1))
      .groupBy(0)
      .sum(1)
    // Sink: for the DataSet API, print() triggers execution itself
    // (no explicit env.execute() needed, same as the original snippet).
    counts.print()
  }
}
Flink常见算子
1.map
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** map demo: a strictly one-to-one transformation of each record. */
public class MapTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Source: one record per text line received on the socket.
        DataStreamSource<String> source = env.socketTextStream("192.168.xx.xx", 9999);
        // The mapper upper-cases every incoming line.
        MapFunction<String, String> toUpper = new MapFunction<String, String>() {
            @Override
            public String map(String line) throws Exception {
                return line.toUpperCase();
            }
        };
        SingleOutputStreamOperator<String> upperCased = source.map(toUpper);
        // Sink: print to stdout; execute() starts the lazily-built job.
        upperCased.print();
        env.execute();
    }
}
2.RichMapFunction
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.api.common.functions.RichMapFunction;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
public class RichMapTest {
    // RichMapFunction (vs. plain MapFunction):
    // 1. exposes the runtime context (subtask index, state, ...);
    // 2. provides the open()/close() lifecycle hooks.
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> source = env.socketTextStream("192.168.XX.XX", 9999);
        SingleOutputStreamOperator<String> enriched = source.map(new RichMapFunction<String, String>() {
            // Runs once per subtask, after construction and before the first
            // map() call — the place to open external connections.
            @Override
            public void open(Configuration parameters) throws Exception {
                super.open(parameters);
            }
            @Override
            public String map(String value) throws Exception {
                // Transform the record; the suffix is just a visible marker.
                return value + "222222222";
            }
            // Runs once per subtask before it stops — release connections here.
            @Override
            public void close() throws Exception {
                super.close();
            }
        });
        enriched.print();
        env.execute();
    }
}
3.flatMap
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/** flatMap demo: one input record may produce zero or more output records. */
public class FlatMapTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> source = env.socketTextStream("192.168.xx.xx", 9999);
        SingleOutputStreamOperator<String> tokens = source.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> out) throws Exception {
                // Emit every space-separated token of the line as its own record.
                for (String token : line.split(" ")) {
                    out.collect(token);
                }
            }
        });
        tokens.print();
        env.execute();
    }
}
4.filter
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.FilterFunction;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
 * filter demo: keep only the records for which the predicate returns true.
 *
 * NOTE(review): this class was originally named RichMapTest, which both
 * collided with the RichMapFunction example's class name and did not
 * describe what this code does; renamed to FilterTest.
 */
public class FilterTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Keep only strings whose length is exactly 2; everything else is dropped.
        SingleOutputStreamOperator<String> filtered = lines.filter(new FilterFunction<String>() {
            @Override
            public boolean filter(String value) throws Exception {
                return value.length() == 2;
            }
        });
        filtered.print();
        env.execute();
    }
}
5.keyBy
5.1.单个字段keyby
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/** keyBy demo: partitioning a stream by a single tuple field, shown in two styles. */
public class KeyByDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Split each line into (word, 1) pairs.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                for (String word : line.split(" ")) {
                    out.collect(Tuple2.of(word, 1));
                }
            }
        });
        // Style 1: key by tuple-field position (untyped key, deprecated in newer Flink).
        KeyedStream<Tuple2<String, Integer>, Tuple> byPosition = wordAndOne.keyBy(0);
        // Style 2: key selector lambda (preferred; yields a typed String key).
        KeyedStream<Tuple2<String, Integer>, String> bySelector = wordAndOne.keyBy(t -> t.f0);
        byPosition.print();
        bySelector.print();
        env.execute();
    }
}
5.2 多个字段keyBy(过时API)
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** keyBy on two tuple positions at once — the deprecated positional API. */
public class KeyByDemo {
    public static void main(String[] args) throws Exception {
        // Input format: "<userId> <monthId> <orderCnt>", e.g. "jack 01 1232".
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Parse each line into a (userId, monthId, orderCnt) triple.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> parsed = lines.map(new MapFunction<String, Tuple3<String, String, Integer>>() {
            @Override
            public Tuple3<String, String, Integer> map(String line) throws Exception {
                String[] fields = line.split(" ");
                return Tuple3.of(fields[0], fields[1], Integer.parseInt(fields[2]));
            }
        });
        // Group by the two leading fields (positions 0 and 1) in one call.
        KeyedStream<Tuple3<String, String, Integer>, Tuple> keyed = parsed.keyBy(0, 1);
        // Running total of orderCnt (position 2) per (userId, monthId).
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> summed = keyed.sum(2);
        summed.print();
        env.execute();
    }
}
5.3.多个字段KeyBy(新API,Tuple封装)
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Multi-field keyBy via a key selector (the non-deprecated API). */
public class KeyByDemo {
    public static void main(String[] args) throws Exception {
        // Input format: "<userId> <monthId> <orderCnt>", e.g. "jack 01 1232".
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Parse each line into a (userId, monthId, orderCnt) triple.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> map = lines.map(new MapFunction<String, Tuple3<String, String, Integer>>() {
            @Override
            public Tuple3<String, String, Integer> map(String s) throws Exception {
                String[] words = s.split(" ");
                String userId = words[0];
                String monthId = words[1];
                Integer orderCnt = Integer.parseInt(words[2]);
                return Tuple3.of(userId, monthId, orderCnt);
            }
        });
        // Composite key via key selector. A separator is inserted because raw
        // concatenation (f0 + f1) can collide: ("ab","c") and ("a","bc") would
        // otherwise both map to key "abc" and be summed together.
        KeyedStream<Tuple3<String, String, Integer>, String> keyBy = map.keyBy(t -> t.f0 + "|" + t.f1);
        // Running total of orderCnt per (userId, monthId).
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> summed = keyBy.sum(2);
        summed.print();
        env.execute();
    }
}
5.4 多个字段KeyBy(POJO封装,终极)
package com.wedoctor.flink;
/**
 * Simple data-holder used for the "keyBy with a POJO" example: public fields
 * plus a public no-arg constructor (the shape Flink's POJO rules expect —
 * TODO confirm against the Flink version in use).
 */
public class WordCount {
    public String word;
    public Integer count;

    /** No-arg constructor (required for reflective instantiation). */
    public WordCount() {
    }

    public WordCount(String word, Integer count) {
        this.word = word;
        this.count = count;
    }

    /** Static factory mirroring Tuple2.of. */
    public static WordCount of(String word, Integer count) {
        return new WordCount(word, count);
    }

    @Override
    public String toString() {
        return "WordCount{word='" + word + "', count=" + count + '}';
    }
}
package com.wedoctor.flink;
// NOTE(review): this class is a byte-for-byte duplicate of the WordCount
// POJO defined directly above; two classes with the same name cannot coexist
// in one package — likely a copy/paste artifact in these notes.
public class WordCount {
// Public fields + public no-arg constructor: the POJO shape this section demonstrates.
public String word;
public Integer count;
public WordCount(String word, Integer count) {
this.word = word;
this.count = count;
}
// No-arg constructor (required for reflective instantiation).
public WordCount() {
}
// Static factory mirroring Tuple2.of.
public static WordCount of(String word,Integer count){
return new WordCount(word,count);
}
@Override
public String toString() {
return "WordCount{" +
"word='" + word + '\'' +
", count=" + count +
'}';
}
}
6.reduce
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;
/** reduce demo: incrementally combine two same-key records into one. */
public class ReduceDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Split each line into (word, 1) pairs.
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = lines.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
                for (String word : line.split(" ")) {
                    out.collect(Tuple2.of(word, 1));
                }
            }
        });
        // The reducer keeps the word and adds the counts — a hand-written sum(1).
        SingleOutputStreamOperator<Tuple2<String, Integer>> counts = wordAndOne
                .keyBy(t -> t.f0)
                .reduce(new ReduceFunction<Tuple2<String, Integer>>() {
                    @Override
                    public Tuple2<String, Integer> reduce(Tuple2<String, Integer> acc, Tuple2<String, Integer> in) throws Exception {
                        return Tuple2.of(acc.f0, acc.f1 + in.f1);
                    }
                });
        counts.print();
        env.execute();
    }
}
7.Aggregations
7.1 sum
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Aggregations 7.1 — sum over a keyed stream. */
public class KeyByDemo {
    public static void main(String[] args) throws Exception {
        // Input format: "<userId> <monthId> <orderCnt>", e.g. "jack 01 1232".
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Parse each line into a (userId, monthId, orderCnt) triple.
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> map = lines.map(new MapFunction<String, Tuple3<String, String, Integer>>() {
            @Override
            public Tuple3<String, String, Integer> map(String s) throws Exception {
                String[] words = s.split(" ");
                String userId = words[0];
                String monthId = words[1];
                Integer orderCnt = Integer.parseInt(words[2]);
                return Tuple3.of(userId, monthId, orderCnt);
            }
        });
        // Composite key via key selector. A separator is inserted because raw
        // concatenation (f0 + f1) can collide: ("ab","c") and ("a","bc") would
        // otherwise both map to key "abc" and be summed together.
        KeyedStream<Tuple3<String, String, Integer>, String> keyBy = map.keyBy(t -> t.f0 + "|" + t.f1);
        // sum(2): running total of orderCnt per (userId, monthId).
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> summed = keyBy.sum(2);
        summed.print();
        env.execute();
    }
}
7.2 min
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Aggregations 7.2 — min: running minimum of one field per key. */
public class AggDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Input format: "<word> <count>".
        DataStreamSource<String> lines = env.socketTextStream("192.168.x.xx", 9999);
        SingleOutputStreamOperator<Tuple2<String, Integer>> parsed = lines.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String line) throws Exception {
                String[] fields = line.split(" ");
                return Tuple2.of(fields[0], Integer.parseInt(fields[1]));
            }
        });
        // Key by the word, then emit the running minimum of field 1 per key.
        KeyedStream<Tuple2<String, Integer>, String> keyed = parsed.keyBy(t -> t.f0);
        keyed.min(1).print();
        env.execute();
    }
}
7.3 max
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Aggregations 7.3 — max: running maximum of one field per key. */
public class AggDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Input format: "<word> <count>".
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        SingleOutputStreamOperator<Tuple2<String, Integer>> parsed = lines.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String line) throws Exception {
                String[] fields = line.split(" ");
                return Tuple2.of(fields[0], Integer.parseInt(fields[1]));
            }
        });
        // Key by the word, then emit the running maximum of field 1 per key.
        KeyedStream<Tuple2<String, Integer>, String> keyed = parsed.keyBy(t -> t.f0);
        keyed.max(1).print();
        env.execute();
    }
}
7.4 minBy
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Aggregations 7.4 — minBy: emit the whole record holding the per-key minimum. */
public class AggDemo {
    // Sample input lines: "lucy 2020-05 15", "jack 2020-02 25".
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.x.xx", 9999);
        // Parse each line into (userId, monthId, orderCnt).
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> parsed = lines.map(new MapFunction<String, Tuple3<String, String, Integer>>() {
            @Override
            public Tuple3<String, String, Integer> map(String line) throws Exception {
                String[] fields = line.split(" ");
                return Tuple3.of(fields[0], fields[1], Integer.parseInt(fields[2]));
            }
        });
        KeyedStream<Tuple3<String, String, Integer>, String> keyed = parsed.keyBy(t -> t.f0);
        // minBy(2, false): the full record with the minimal field 2; `false`
        // prefers the latest record on ties instead of the first one seen.
        keyed.minBy(2, false).print();
        env.execute();
    }
}
7.5 maxBy
package com.wedoctor.flink;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** Aggregations 7.5 — maxBy: emit the whole record holding the per-key maximum. */
public class AggDemo {
    // Sample input lines: "lucy 2020-05 15", "jack 2020-02 25".
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        DataStreamSource<String> lines = env.socketTextStream("192.168.xx.xx", 9999);
        // Parse each line into (userId, monthId, orderCnt).
        SingleOutputStreamOperator<Tuple3<String, String, Integer>> parsed = lines.map(new MapFunction<String, Tuple3<String, String, Integer>>() {
            @Override
            public Tuple3<String, String, Integer> map(String line) throws Exception {
                String[] fields = line.split(" ");
                return Tuple3.of(fields[0], fields[1], Integer.parseInt(fields[2]));
            }
        });
        KeyedStream<Tuple3<String, String, Integer>, String> keyed = parsed.keyBy(t -> t.f0);
        // maxBy(2, false): the full record with the maximal field 2; `false`
        // prefers the latest record on ties instead of the first one seen.
        keyed.maxBy(2, false).print();
        env.execute();
    }
}
8 union
package com.wedoctor.flink;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/** union demo: merge two streams of the same element type into one. */
public class UnionDemo {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Two bounded demo sources carrying the same element type (Integer).
        DataStreamSource<Integer> first = env.fromElements(1, 2, 3, 4, 5);
        DataStreamSource<Integer> second = env.fromElements(5, 7, 8, 9, 10);
        // union keeps every element from both inputs (duplicates are NOT removed).
        DataStream<Integer> merged = first.union(second);
        merged.print();
        env.execute();
    }
}
DataSet API
DataStream API
DataStream API 与 DataSet API 的区别在于:DataStream 的输入是一个无限的数据流
DataStream独有的API
参考
https://mp.weixin.qq.com/s?__biz=MzIxMjI3NTI5OQ==&mid=2650461709&idx=1&sn=b6f027e02ae9632a38766b5243c4ed32&chksm=8f46ef01b831661733e1c2f78e7f50fd2d806b9032a1b110b60692dd242700f41e90b7d6f474&scene=21#wechat_redirect
Flink DataStream常用算子
Flink的DataSet基本算子总结
Flink系列:常用算子一览表
官网地址