程序与数据流
所有 Flink 程序都由三部分组成:Source、Transformation 和 Sink。
Source:负责读取数据源中的数据。
Transformation:利用各种算子对数据进行处理加工。
Sink:负责输出结果。
Source
设置java beans
package com.hadwinling.apitest.beans;
/**
* @description: 传感器温度读数的数据类型
* @author: hadwinling
* @time: 2021/3/28 上午11:24
*/
public class SensorReading {
// 属性 id,时间戳,温度值
private String id ;
private Long timestamp;
private Double temperature;
public SensorReading() {
}
public SensorReading(String id, Long timestamp, Double temperature) {
this.id = id;
this.timestamp = timestamp;
this.temperature = temperature;
}
public String getId() {
return id;
}
public void setId(String id) {
this.id = id;
}
public Long getTimestamp() {
return timestamp;
}
public void setTimestamp(Long timestamp) {
this.timestamp = timestamp;
}
public Double getTemperature() {
return temperature;
}
public void setTemperature(Double temperature) {
this.temperature = temperature;
}
@Override
public String toString() {
return "SensorReading{" +
"id='" + id + '\'' +
", timestamp=" + timestamp +
", temperature=" + temperature +
'}';
}
}
Flink 读取数据
从集合读取数据
package com.hadwinling.apitest.source;
import com.hadwinling.apitest.beans.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import java.util.Arrays;
/**
* @description:
* @author: hadwinling
* @time: 2021/3/28 上午11:27
*/
/**
 * Demonstrates building bounded streams from in-memory data:
 * {@code fromCollection} for a prepared list and {@code fromElements}
 * for a handful of literal values.
 */
public class SourceTest1_Collection {
    public static void main(String[] args) throws Exception {
        // Create the streaming environment. An explicit parallelism of 1 keeps
        // the printed output in a single, ordered stream; without it the value
        // from flink-conf.yaml applies (default 1).
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // Bounded stream built from a Java collection of sensor readings
        DataStream<SensorReading> sensorStream = env.fromCollection(Arrays.asList(
                new SensorReading("sensor_1", 1547718199L, 35.8),
                new SensorReading("sensor_6", 1547718201L, 15.4),
                new SensorReading("sensor_7", 1547718202L, 6.7),
                new SensorReading("sensor_10", 1547718205L, 38.1)
        ));

        // Bounded stream built directly from individual elements
        DataStream<Integer> numberStream = env.fromElements(1, 2, 4, 67, 189);

        // The argument to print() is a prefix that tags each stream's output,
        // making the two interleaved streams distinguishable on the console.
        sensorStream.print("data");
        numberStream.print("int");

        // Launch the job
        env.execute();
    }
}
从文件读取数据
package com.hadwinling.apitest.source;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
/**
* @description:
* @author: hadwinling
* @time: 2021/3/28 下午3:06
*/
/**
 * Demonstrates reading a bounded stream from a local text file,
 * one line per record.
 */
public class SourceTest2_File {
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming environment with parallelism 1
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // 2. Read the sensor file line by line as strings
        DataStreamSource<String> fileStream =
                env.readTextFile("/home/hadoop/IdeaProjects/BigDataLearn/FlinkDatas/sensor.txt");

        // 3. Echo each line to stdout
        fileStream.print();

        // 4. Launch the job
        env.execute();
    }
}
从 Kafka 读取数据
package com.hadwinling.apitest.source;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.util.Properties;
/**
* @description:
* @author: hadwinling
* @time: 2021/3/28 下午3:09
*/
/**
 * Demonstrates consuming a stream from a Kafka topic via the
 * Flink Kafka connector.
 */
public class SourceTest3_Kafka {
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Consumer configuration for the local Kafka broker
        Properties kafkaProps = new Properties();
        kafkaProps.setProperty("bootstrap.servers", "localhost:9092");
        kafkaProps.setProperty("group.id", "consumer-group");
        kafkaProps.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaProps.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        kafkaProps.setProperty("auto.offset.reset", "latest");

        // 2. Subscribe to topic "dblab", deserializing each record as a plain string
        //    (Kafka setup: https://blog.csdn.net/Alingyuzi/article/details/115282103)
        FlinkKafkaConsumer<String> consumer =
                new FlinkKafkaConsumer<String>("dblab", new SimpleStringSchema(), kafkaProps);
        DataStream<String> kafkaStream = env.addSource(consumer);

        // 3. Echo each record to stdout
        kafkaStream.print();

        // 4. Launch the job
        env.execute();
    }
}
自定义数据源
package com.hadwinling.apitest.source;
import com.hadwinling.apitest.beans.SensorReading;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import java.util.HashMap;
import java.util.Random;
/**
* @description:
* @author: hadwinling
* @time: 2021/3/28 下午4:09
*/
/**
 * Demonstrates a user-defined source: a {@link SourceFunction} that emits a
 * random-walk temperature for ten simulated sensors once per second until
 * the job is cancelled.
 */
public class SourceTest4_UDF {
    public static void main(String[] args) throws Exception {
        // 1. Create the streaming environment with parallelism 1
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        // 2. Attach the custom source (the original comment incorrectly said
        //    "read from file" — this reads from the UDF below, not a file)
        DataStreamSource<SensorReading> sensorReadingDataStreamSource = env.addSource(new mySensorSource());

        // 3. Echo each generated reading to stdout
        sensorReadingDataStreamSource.print();

        // 4. Launch the job
        env.execute();
    }

    /**
     * Custom source emitting simulated sensor readings.
     *
     * <p>NOTE(review): by Java convention this class should be named
     * {@code MySensorSource}; the original name is kept so existing
     * references keep compiling.
     */
    public static class mySensorSource implements SourceFunction<SensorReading> {
        // Cancellation flag. Flink invokes cancel() from a different thread
        // than run(), so the flag must be volatile for the write to be
        // visible to the emitting loop — otherwise the source may never stop.
        private volatile boolean running = true;

        @Override
        public void run(SourceContext<SensorReading> sourceContext) throws Exception {
            Random random = new Random();

            // Seed ten sensors with Gaussian initial temperatures around 60
            HashMap<String, Double> sensorTempMap = new HashMap<>();
            for (int i = 0; i < 10; i++) {
                sensorTempMap.put("sensor_" + (i + 1), 60 + random.nextGaussian() * 20);
            }

            while (running) {
                for (String sensorId : sensorTempMap.keySet()) {
                    // Random walk: drift each temperature by a small Gaussian step
                    // (primitive double avoids needless boxing of the intermediate)
                    double newTemp = sensorTempMap.get(sensorId) + random.nextGaussian();
                    sensorTempMap.put(sensorId, newTemp);
                    sourceContext.collect(new SensorReading(sensorId, System.currentTimeMillis(), newTemp));
                }
                // Throttle output to one batch of readings per second
                Thread.sleep(1000L);
            }
        }

        @Override
        public void cancel() {
            running = false;
        }
    }
}
sensor.txt 文件内容
sensor_1,1547718199,35.8
sensor_6,1547718201,15.4
sensor_7,1547718202,6.7
sensor_10,1547718205,38.1
sensor_1,1547718207,36.3
sensor_1,1547718209,32.8
sensor_1,1547718212,37.1