一.DataSet API处理有界数据流
package myflink;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchWordCount {

    public static void main(String[] args) throws Exception {
        // Set up the batch execution environment.
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Bounded in-memory data source.
        final DataSet<String> lines = env.fromElements("aaa bbb ccc", "aaa bbb", "aaa");

        // Tokenize, group by the word (tuple field 0), sum the counts (field 1).
        final DataSet<Tuple2<String, Integer>> counts =
                lines.flatMap(new LineSplitter())
                        .groupBy(0)
                        .sum(1);

        // Emit the result to stdout.
        counts.print();
    }

    /** Splits each line on single spaces and emits one (word, 1) tuple per token. */
    static class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String line, Collector<Tuple2<String, Integer>> out) throws Exception {
            final String[] tokens = line.split(" ");
            for (String token : tokens) {
                out.collect(new Tuple2<>(token, 1));
            }
        }
    }
}
二.DataStream 处理无界数据流
2.1定义数据源生产数据
package dataStream;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Random;
/**
 * Custom unbounded source that emits one random word per second until cancelled.
 */
public class MySource implements SourceFunction<String> {

    // Candidate words; built once instead of re-allocating the list on every loop iteration.
    private static final List<String> WORDS =
            Arrays.asList("word", "aaa", "bbb", "ccc", "ddd", "eee", "fff");

    // volatile: cancel() is invoked from a different thread than run(), so the
    // flag write must be visible to the emitting loop. (Also fixes the
    // "isRuning" typo of the original.)
    private volatile boolean isRunning = true;

    @Override
    public void run(SourceContext<String> sourceContext) throws Exception {
        // Hoisted out of the loop: one RNG for the source's lifetime.
        final Random random = new Random();
        while (isRunning) {
            // Pick and emit a random word.
            sourceContext.collect(WORDS.get(random.nextInt(WORDS.size())));
            // Throttle to one record per second.
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
2.2数据流处理
package dataStream;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import util.LineSplitter;
// NOTE(review): class name probably meant "StreamWordCount" — kept as-is to avoid breaking callers.
public class StreamWountCount {

    public static void main(String[] args) throws Exception {
        // Obtain the streaming execution environment.
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Unbounded source -> tokenize -> key by word (field 0, analogous to groupBy) -> running sum (field 1).
        final DataStream<Tuple2<String, Integer>> counts =
                env.addSource(new MySource())
                        .flatMap(new LineSplitter())
                        .keyBy(0)
                        .sum(1);

        counts.print();
        // Dump the JSON execution plan for inspection.
        System.out.println(env.getExecutionPlan());
        // Start the streaming job.
        env.execute("WordCount");
    }
}
注意:把实现FlatMapFunction接口的LineSplitter类提取到了util包中,供批处理和流处理两个作业复用
package util;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
/**
 * Shared tokenizer: turns a line of text into (word, 1) tuples.
 *
 * <p>Splits on runs of whitespace ({@code \s+}) rather than a single space, and
 * skips empty tokens, so leading/trailing/repeated separators no longer emit
 * spurious "" words into the count.
 */
public class LineSplitter implements FlatMapFunction<String, Tuple2<String, Integer>> {
    @Override
    public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) throws Exception {
        for (String word : s.split("\\s+")) {
            // split() can yield a leading empty token (e.g. " a".split) — drop it.
            if (!word.isEmpty()) {
                collector.collect(new Tuple2<>(word, 1));
            }
        }
    }
}
2.3利用窗口处理无界流数据
package dataStream;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import util.LineSplitter;
public class StreamTimeWindow {

    public static void main(String[] args) throws Exception {
        // Streaming execution environment.
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Source -> tokenize -> key by word -> 5-second tumbling window -> per-window sum.
        // The source emits one record per second, so each window covers ~5 records.
        final DataStream<Tuple2<String, Integer>> windowedCounts =
                env.addSource(new MySource())
                        .flatMap(new LineSplitter())
                        .keyBy(0)
                        .timeWindow(Time.seconds(5))
                        .sum(1);

        windowedCounts.print();
        env.execute("WordCount");
    }
}
注意:窗口长度为5秒,每五秒触发一次输出;由于数据源每秒产生一条数据,所以每个窗口大约聚合五条输入数据(输出为各单词在该窗口内的计数)。
三.Table API处理数据
3.1无界流
package tableAPI;
import dataStream.MySource;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.EnvironmentSettings;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
public class WordCountTableStream {

    public static void main(String[] args) throws Exception {
        // Streaming execution environment.
        final StreamExecutionEnvironment senv = StreamExecutionEnvironment.getExecutionEnvironment();

        // Table-environment settings: legacy planner, streaming mode (the default).
        final EnvironmentSettings settings =
                EnvironmentSettings.newInstance()
                        .useOldPlanner() // swap for useBlinkPlanner() to use the Blink planner
                        .inStreamingMode()
                        .build();
        final StreamTableEnvironment tenv = StreamTableEnvironment.create(senv, settings);

        // Wrap the custom unbounded source as a single-column table named "word".
        final DataStreamSource<String> dataStreamSource = senv.addSource(new MySource());
        final Table words = tenv.fromDataStream(dataStreamSource, $("word"));

        // Keep only rows whose word contains the letter 'o'.
        final Table filtered = words.where($("word").like("%o%"));

        // Print the query plan for the filtered table.
        String explain = tenv.explain(filtered);
        System.out.println("explain = " + explain);

        // Convert the Table back into an append-only DataStream and print it.
        tenv.toAppendStream(filtered, Row.class).print("word1");

        // Run the job.
        senv.execute();
    }
}
在新一版的Flink中增加了Blink分支的Table Planner,同时Blink分支的Planner提供了更多的内置函数,更标准的SQL支持。
3.2有界流
3.2.1创建实体类
package entity;
/**
 * Plain order POJO. Kept as a mutable bean (no-arg constructor plus
 * getters/setters) so Flink's POJO type extraction can serialize it.
 */
public class MyOrder {

    private long id;        // order identifier
    private String product; // product name
    private int amount;     // ordered quantity

    /** No-arg constructor, required for POJO serialization. */
    public MyOrder() {
    }

    public MyOrder(long id, String product, int amount) {
        this.id = id;
        this.product = product;
        this.amount = amount;
    }

    public long getId() {
        return id;
    }

    public void setId(long id) {
        this.id = id;
    }

    public String getProduct() {
        return product;
    }

    public void setProduct(String product) {
        this.product = product;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    @Override
    public String toString() {
        // Same textual form as the hand-concatenated original.
        return String.format("MyOrder{id=%d, product='%s', amount=%d}", id, product, amount);
    }
}
3.2.2DataSet 转 Table
package tableAPI;
import entity.MyOrder;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
import static org.apache.flink.table.api.Expressions.$;
public class TableBatch {

    /**
     * DataSet -> Table round trip: builds a bounded set of orders, filters rows
     * with amount >= 8 via the Table API, and prints the result.
     */
    public static void main(String[] args) throws Exception {
        // Batch execution environment plus its Table API wrapper.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tEnv = BatchTableEnvironment.create(env);

        // Bounded in-memory source. Uppercase 'L' suffix: lowercase 'l' is
        // easily misread as the digit '1'.
        DataSource<MyOrder> input = env.fromElements(
                new MyOrder(1L, "aaa", 1),
                new MyOrder(8L, "asd", 8),
                new MyOrder(8L, "asd", 8),
                new MyOrder(3L, "abc", 20));

        // DataSet -> Table.
        Table table = tEnv.fromDataSet(input);

        // Keep only orders whose amount is at least 8.
        Table filtered = table.where($("amount").isGreaterOrEqual(8));

        // Table -> DataSet, then print.
        DataSet<MyOrder> result = tEnv.toDataSet(filtered, MyOrder.class);
        result.print();
    }
}
四.SQL 处理有界无界数据
4.1有界流
package tableAPI;
import entity.MyOrder;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.BatchTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
public class SQLBatch {

    /**
     * SQL over a bounded DataSet: registers the orders as a temporary view and
     * aggregates the total amount per product.
     */
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tEnv = BatchTableEnvironment.create(env);

        // Bounded source. Uppercase 'L' suffix instead of the easily-misread 'l'.
        DataSet<MyOrder> input = env.fromElements(
                new MyOrder(1L, "aaa", 1),
                new MyOrder(8L, "asd", 8),
                new MyOrder(8L, "asd", 8),
                new MyOrder(3L, "abc", 20));

        // Expose the DataSet as a SQL view with an explicit column order.
        tEnv.createTemporaryView("MyOrder", input, $("id"), $("product"), $("amount"));

        // Total amount per product.
        Table table = tEnv.sqlQuery("select product,sum(amount) as amount from MyOrder group by product");
        tEnv.toDataSet(table, Row.class).print();
    }
}
4.2无界流
package tableAPI;
import dataStream.MySource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.bridge.java.StreamTableEnvironment;
import org.apache.flink.types.Row;
import static org.apache.flink.table.api.Expressions.$;
public class SQLStream {

    public static void main(String[] args) throws Exception {
        // Streaming execution environment plus its Table/SQL wrapper.
        final StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        final StreamTableEnvironment tableEnv = StreamTableEnvironment.create(streamEnv);

        // Wrap the unbounded custom source as a single-column table "word".
        final DataStream<String> source = streamEnv.addSource(new MySource());
        final Table wordTable = tableEnv.fromDataStream(source, $("word"));

        // Concatenating the Table into the SQL string registers it under a generated name.
        final Table filtered = tableEnv.sqlQuery("select * from " + wordTable + " where word like '%a%'");

        // Emit as an append stream, then start the job.
        tableEnv.toAppendStream(filtered, Row.class).print();
        streamEnv.execute();
    }
}