environment --> source --> transform --> sink
Step 1: create the execution environment
getExecutionEnvironment already integrates both modes: it returns a local environment when the program runs standalone (e.g. in an IDE) and a cluster environment when the jar is submitted to a cluster.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Parallelism can be configured on env; if it is not set in code, the value from flink-conf.yaml applies, which defaults to 1.
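A quick sketch of setting it in code (this overrides the flink-conf.yaml value for the job):
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(4); // job-wide default parallelism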
createLocalEnvironment
Returns a local execution environment; the default parallelism can be specified when calling it.
LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1);
createRemoteEnvironment
Returns a cluster execution environment and submits the jar to a remote server; the JobManager's hostname and port must be given when calling it, together with the jar to run on the cluster.
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("jobmanager-hostname", 6123, "/Users/code/testDemo/target/demo-0.0.1-SNAPSHOT.jar");
2.1 Source
2.1.1 Reading from a collection:
public class SourceTest1_Collection {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//1. Source: read data from a collection
DataStream<SensorReading> dataStream = env.fromCollection(Arrays.asList(
new SensorReading("sensor_1", 188888888, 35.0),
new SensorReading("sensor_1", 177777777, 35.0),
new SensorReading("sensor_1", 166666666, 35.0),
new SensorReading("sensor_1", 155555555, 35.0)
));
//Alternatively, read from individual elements:
DataStream<Integer> integerDataStream = env.fromElements(1, 2, 0, 67, 189);
dataStream.print("data");
integerDataStream.print("int");
//Execute the job
env.execute();
}
}
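All of these examples use a SensorReading POJO that is never listed in this section; a minimal sketch of what it is assumed to look like, based on the constructor and getters used throughout:
public class SensorReading {
    // id, timestamp, temperature — matches the (String, Long, Double) constructor calls above
    private String id;
    private Long timestamp;
    private Double temperature;
    public SensorReading() {} // no-arg constructor, required for Flink POJO handling
    public SensorReading(String id, Long timestamp, Double temperature) {
        this.id = id;
        this.timestamp = timestamp;
        this.temperature = temperature;
    }
    public String getId() { return id; }
    public Long getTimestamp() { return timestamp; }
    public Double getTemperature() { return temperature; }
    public void setId(String id) { this.id = id; }
    public void setTimestamp(Long timestamp) { this.timestamp = timestamp; }
    public void setTemperature(Double temperature) { this.temperature = temperature; }
    @Override
    public String toString() {
        return "SensorReading{id='" + id + "', timestamp=" + timestamp + ", temperature=" + temperature + "}";
    }
}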
2.1.2 Reading from a file
public class SourceTest2_File {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from the file
DataStream<String> dataStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
//Print the output
dataStream.print();
env.execute();
}
}
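The sensor.txt file is assumed to hold one comma-separated record per line in id,timestamp,temperature order, matching the split(",") parsing used in the transform examples later, e.g.:
sensor_1,1547718199,35.8
sensor_6,1547718201,15.4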
2.1.3 Reading from Kafka
Add the dependency:
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-kafka-0.11_2.12</artifactId>
<version>1.10.0</version>
</dependency>
public class SourceTest3_Kafka {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
Properties properties = new Properties();
properties.setProperty("bootstrap.servers","localhost:8080");
//以下这些可以不指定
// properties.setProperty("group.id","consumer-group");
// properties.setProperty("key.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
// properties.setProperty("value.deserializer","org.apache.kafka.common.serialization.StringDeserializer");
// properties.setProperty("auto.offset.reset","latest");
DataStream<String> dataStream = env.addSource(new FlinkKafkaConsumer011<String>("sensor",new SimpleStringSchema(),properties));
//Print the output
dataStream.print();
env.execute();
}
}
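To test the source, messages can be fed into the sensor topic with Kafka's console producer (assuming a local Kafka installation; for Kafka 0.11 the broker flag is --broker-list):
bin/kafka-console-producer.sh --broker-list localhost:9092 --topic sensor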
2.1.4 Custom source: generating mock data
Besides the sources above, we can also define our own source; all that is needed is to pass a SourceFunction to addSource, used as follows:
public class SourceTest4_UDF {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from the custom source
DataStream<SensorReading> dataStream = env.addSource(new MySensorSource());
//Print the output
dataStream.print();
env.execute();
}
//Custom SourceFunction implementation
private static class MySensorSource implements SourceFunction<SensorReading> {
//Flag controlling data generation
private boolean running = true;
@Override
public void run(SourceContext<SensorReading> sourceContext) throws Exception {
//Random number generator
Random random = new Random();
//Set initial temperatures for 10 sensors
HashMap<String, Double> sensorTempMap = new HashMap<>();
for (int i = 0 ; i<10 ; i++) {
//nextGaussian yields values roughly within ±3 (standard normal)
sensorTempMap.put("sensor_"+ (i+1),60 + random.nextGaussian()*20);
}
while (running) {
for (String sensorId : sensorTempMap.keySet()) {
// Fluctuate randomly around the current temperature
Double newtemp = sensorTempMap.get(sensorId) + random.nextGaussian();
sensorTempMap.put(sensorId,newtemp);
sourceContext.collect(new SensorReading(sensorId,System.currentTimeMillis(),newtemp));
}
//Throttle the output rate
Thread.sleep(1000L);
}
}
@Override
public void cancel() {
running = false;
}
}
}
3.1 Transform operators
public class TransformTest1_Base {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// env.setParallelism(1);
//Read data from a file
DataStream<String> dataStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
//1. map: convert each String to its length
DataStream<Integer> mapStream = dataStream.map(new MapFunction<String, Integer>() {
@Override
public Integer map(String value) throws Exception {
return value.length();
}
});
//2. flatMap: split each line on commas
DataStream<String> flatMapStream = dataStream.flatMap(new FlatMapFunction<String, String>() {
@Override
public void flatMap(String value, Collector<String> out) throws Exception {
String[] split = value.split(",");
for (String s : split) {
out.collect(s);
}
}
});
//3. filter: keep records whose id starts with sensor_1
DataStream<String> filterStream = dataStream.filter(new FilterFunction<String>() {
@Override
public boolean filter(String value) throws Exception {
return value.startsWith("sensor_1");
}
});
//4. Print the output
mapStream.print("map");
flatMapStream.print("flatmap");
filterStream.print("filter");
env.execute();
}
}
Rolling aggregation: KeyBy
DataStream --> KeyedStream: logically splits a stream into disjoint partitions, each containing the elements that share the same key; internally this is implemented with hash partitioning.
public class TransformTest2_RollingAggregation {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
//Convert to SensorReading
// SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(new MapFunction<String, SensorReading>() {
// @Override
// public SensorReading map(String value) throws Exception {
// String[] fields = value.split(",");
// return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
// }
// });
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Key by sensor id
KeyedStream<SensorReading, Tuple> keyedStream = dataStream.keyBy("id");
// KeyedStream<SensorReading, String> keyedStream1 = dataStream.keyBy(data -> data.getId());
// KeyedStream<SensorReading, String> keyedStream1 = dataStream.keyBy(SensorReading :: getId);
//Rolling aggregation: keep the record with the max temperature so far (maxBy returns the whole record)
// SingleOutputStreamOperator<SensorReading> resultStream = keyedStream.max("temperature");
SingleOutputStreamOperator<SensorReading> resultStream = keyedStream.maxBy("temperature");
resultStream.print();
env.execute();
}
}
Reduce
KeyedStream --> DataStream: a grouped aggregation operation that combines the current element with the last aggregated result to produce a new value. The returned stream contains the result of every aggregation step, not just the final result.
public class TransformTest3_Reduce {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Key by sensor id
KeyedStream<SensorReading, Tuple> keyedStream = dataStream.keyBy("id");
//reduce: keep the max temperature together with the latest timestamp
SingleOutputStreamOperator<SensorReading> reduce = keyedStream.reduce(new ReduceFunction<SensorReading>() {
@Override
public SensorReading reduce(SensorReading value1, SensorReading value2) throws Exception {
return new SensorReading(value1.getId(), value2.getTimestamp(), Math.max(value1.getTemperature(), value2.getTemperature()));
}
});
reduce.print();
env.execute();
}
}
Split and Select (now deprecated)
SplitStream --> DataStream: obtain one or more DataStreams from a SplitStream. Requirement: split the sensor stream into two streams by high/low temperature.
public class TransformTest4_MultipleStream {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Split the stream in two, using a temperature of 30 as the boundary
SplitStream<SensorReading> splitStream = dataStream.split(new OutputSelector<SensorReading>() {
@Override
public Iterable<String> select(SensorReading value) {
return (value.getTemperature() > 30 ) ? Collections.singletonList("high") :Collections.singletonList("low");
}
});
DataStream<SensorReading> high = splitStream.select("high");
DataStream<SensorReading> low = splitStream.select("low");
DataStream<SensorReading> all = splitStream.select("high","low");
high.print("high");
low.print("low");
all.print("all");
env.execute();
}
}
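Since split/select is deprecated, newer Flink versions express the same fan-out with side outputs in a ProcessFunction. A minimal sketch, reusing the dataStream from the example above (lowTag is an illustrative name):
OutputTag<SensorReading> lowTag = new OutputTag<SensorReading>("low") {};
SingleOutputStreamOperator<SensorReading> high = dataStream.process(new ProcessFunction<SensorReading, SensorReading>() {
    @Override
    public void processElement(SensorReading value, Context ctx, Collector<SensorReading> out) throws Exception {
        if (value.getTemperature() > 30) {
            out.collect(value); // main output: high temperatures
        } else {
            ctx.output(lowTag, value); // side output: low temperatures
        }
    }
});
DataStream<SensorReading> low = high.getSideOutput(lowTag);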
Connect and CoMap (limitation: only two streams can be connected, but their data types may differ)
DataStream, DataStream --> ConnectedStreams: connects two streams while preserving their types. After connect, the two streams are merely placed inside one stream; internally each keeps its own data and form unchanged, and the two remain independent.
public class TransformTest4_MultipleStream {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Split the stream in two, using a temperature of 30 as the boundary
SplitStream<SensorReading> splitStream = dataStream.split(new OutputSelector<SensorReading>() {
@Override
public Iterable<String> select(SensorReading value) {
return (value.getTemperature() > 30 ) ? Collections.singletonList("high") :Collections.singletonList("low");
}
});
DataStream<SensorReading> high = splitStream.select("high");
DataStream<SensorReading> low = splitStream.select("low");
DataStream<SensorReading> all = splitStream.select("high","low");
//Merge: convert the high-temperature stream into tuples, then connect it with the low-temperature stream and emit status info
DataStream<Tuple2<String, Double>> warningStream = high.map(new MapFunction<SensorReading, Tuple2<String, Double>>() {
@Override
public Tuple2<String, Double> map(SensorReading value) throws Exception {
return new Tuple2<>(value.getId(), value.getTemperature());
}
});
ConnectedStreams<Tuple2<String, Double>, SensorReading> connectStream = warningStream.connect(low);
DataStream<Object> mapStream = connectStream.map(new CoMapFunction<Tuple2<String, Double>, SensorReading, Object>() {
@Override
public Object map1(Tuple2<String, Double> value) throws Exception {
return new Tuple3<>(value.f0, value.f1, "high temp warning");
}
@Override
public Object map2(SensorReading value) throws Exception {
return new Tuple2<>(value.getId(), "normal");
}
});
mapStream.print();
env.execute();
}
}
Union (limitation: the data types must be identical)
DataStream --> DataStream: unions two or more DataStreams, producing a new DataStream that contains all elements of the input streams.
public class TransformTest5_Union {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Split the stream in two, using a temperature of 30 as the boundary
SplitStream<SensorReading> splitStream = dataStream.split(new OutputSelector<SensorReading>() {
@Override
public Iterable<String> select(SensorReading value) {
return (value.getTemperature() > 30 ) ? Collections.singletonList("high") :Collections.singletonList("low");
}
});
DataStream<SensorReading> high = splitStream.select("high");
DataStream<SensorReading> low = splitStream.select("low");
DataStream<SensorReading> all = splitStream.select("high","low");
//3. union: merge multiple streams
DataStream<SensorReading> union = high.union(low, all);
union.print();
env.execute();
}
}
Flink supports all basic Java and Scala data types.
Implementing UDF function classes ------ finer-grained control over the stream
Define a custom class that implements the corresponding function interface.
Rich Functions
"Rich functions" are function-class interfaces provided by the DataStream API; every Flink function class has its Rich version. Unlike a regular function, a rich function can access the runtime context and has lifecycle methods, so it can implement more complex functionality.
RichMapFunction
RichFlatMapFunction
RichFilterFunction
open() is the rich function's initialization method
close() is the teardown method
public class TransformTest6_RichFunction {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
SingleOutputStreamOperator<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
DataStream<Tuple2<String, Integer>> resultStream = dataStream.map(new Mymapper());
resultStream.print();
env.execute();
}
//Plain MapFunction implementation, for comparison (not used in main above)
private static class Mymapper0 implements MapFunction<SensorReading,Tuple2<String,Integer>> {
@Override
public Tuple2<String, Integer> map(SensorReading value) throws Exception {
return new Tuple2<>(value.getId(),value.getId().length());
}
}
//Custom rich function implementation
private static class Mymapper extends RichMapFunction<SensorReading,Tuple2<String,Integer>> {
@Override
public Tuple2<String, Integer> map(SensorReading value) throws Exception {
return new Tuple2<>(value.getId(),getRuntimeContext().getIndexOfThisSubtask());
}
@Override
public void open(Configuration parameters) throws Exception {
//Initialization work, typically defining state or opening a shared database connection
System.out.println("open()");
}
@Override
public void close() throws Exception {
//Cleanup work, typically closing connections and clearing state
System.out.println("close()");
}
}
}
Repartitioning operators
public class TransformTest7_Partition {
public static void main(String[] args) throws Exception{
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
//env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
DataStream<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
inputStream.print("input");
//1. shuffle: redistribute data randomly
DataStream<String> shuffleStream = inputStream.shuffle();
shuffleStream.print("shuffle");
//2.keyBy
dataStream.keyBy("id").print("keyBy");
//3. global: send everything to the first downstream partition
dataStream.global().print("global");
env.execute();
}
}
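Besides shuffle, keyBy, and global, DataStream offers further repartitioning operators such as rebalance (round-robin) and broadcast (replicate to every partition); these lines would go inside the same main as above:
dataStream.rebalance().print("rebalance");
dataStream.broadcast().print("broadcast");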
Sink
Flink has no foreach-style method like Spark's that lets users iterate over records and write them out directly; all output to external systems goes through a Sink, and a job's final output step is typically completed like this:
stream.addSink(new MySink(xxx))
Kafka
Once Kafka is set up and started, the sink below can be used directly:
public class SinkTest1_kafka {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
DataStream<String> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2])).toString();
});
dataStream.addSink(new FlinkKafkaProducer011<String>("localhost : 8080","sinktest",new SimpleStringSchema()));
env.execute();
}
}
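The sink output can be checked with Kafka's console consumer on the sinktest topic (assuming a local Kafka installation):
bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic sinktest --from-beginning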
Redis
Add the Bahir Redis connector dependency:
<dependency>
<groupId>org.apache.bahir</groupId>
<artifactId>flink-connector-redis_2.11</artifactId>
<version>1.0</version>
</dependency>
public class SinkTest1_redis {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
DataStream<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Jedis connection configuration
FlinkJedisPoolConfig config = new FlinkJedisPoolConfig.Builder()
.setHost("localhost")
.setPort(6379)
.build();
dataStream.addSink(new RedisSink<>(config, new RedisMapper<SensorReading>() {
//Define the Redis command: store as a hash, i.e. hset sensor_temp id temperature
@Override
public RedisCommandDescription getCommandDescription() {
return new RedisCommandDescription(RedisCommand.HSET,"sensor_temp");
}
@Override
public String getKeyFromData(SensorReading data) {
return data.getId();
}
@Override
public String getValueFromData(SensorReading data) {
return data.getTemperature().toString();
}
}));
env.execute();
}
}
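The written hash can be inspected in redis-cli:
HGETALL sensor_temp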
Elasticsearch
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-connector-elasticsearch6_2.12</artifactId>
<version>1.10.1</version>
</dependency>
public class SinkTest3_es {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
DataStream<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
//Define the Elasticsearch hosts
ArrayList<HttpHost> httpHosts = new ArrayList<>();
httpHosts.add(new HttpHost("localhost",9200));
dataStream.addSink(new ElasticsearchSink.Builder<SensorReading>(httpHosts, new MyEsSinkFunction()).build());
env.execute();
}
//Custom Elasticsearch write logic
private static class MyEsSinkFunction implements ElasticsearchSinkFunction<SensorReading> {
@Override
public void process(SensorReading element, RuntimeContext ctx, RequestIndexer indexer) {
//Build the document source to write
HashMap<String, String> dataSource = new HashMap<>();
dataSource.put("id",element.getId());
dataSource.put("tmp",element.getTemperature().toString());
dataSource.put("ts",element.getTimestamp().toString());
//Create the index request, i.e. the write command sent to ES
IndexRequest source = Requests.indexRequest()
.index("sensor")
.type("readingdata")
.source(dataSource);
//Send the request via the indexer
indexer.add(source);
}
}
}
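The indexed documents can be checked with a simple query (assuming ES runs on localhost:9200):
curl "http://localhost:9200/sensor/_search?pretty"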
JDBC
A custom sink extending RichSinkFunction writes to MySQL:
public class SinkTest4_Jdbc {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
//Read data from a file
DataStream<String> inputStream = env.readTextFile("/Users/code/testDemo/src/main/resources/sensor.txt");
DataStream<SensorReading> dataStream = inputStream.map(line -> {
String[] fields = line.split(",");
return new SensorReading(fields[0],new Long(fields[1]),new Double(fields[2]));
});
dataStream.addSink(new MyJdbcSink());
env.execute();
}
//Custom sink that manages its own JDBC connection
private static class MyJdbcSink extends RichSinkFunction<SensorReading> {
Connection connection = null;
PreparedStatement insert = null;
PreparedStatement update = null;
@Override
public void open(Configuration parameters) throws Exception {
connection = DriverManager.getConnection("jdbc:mysql://localhost:3306/test","root","root");
insert = connection.prepareStatement("insert into sensor_temp(id,tmp) values(?,?)");
update = connection.prepareStatement("update sensor_temp set temp = ? where id = ?");
}
//For each record, use the connection to execute SQL
@Override
public void invoke(SensorReading value, Context context) throws Exception {
//Try the update first; if no row was updated, insert instead
update.setDouble(1,value.getTemperature());
update.setString(2,value.getId());
update.execute();
if (update.getUpdateCount() == 0) {
insert.setString(1,value.getId());
insert.setDouble(2,value.getTemperature());
insert.execute();
}
}
@Override
public void close() throws Exception {
insert.close();
update.close();
connection.close();
}
}
}
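The sink assumes a sensor_temp table with id and temp columns; a minimal sketch of the DDL (column types are assumptions):
CREATE TABLE sensor_temp (
    id VARCHAR(32) PRIMARY KEY,
    temp DOUBLE
);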