1. Introduction
Flink ships with a number of basic Sources and Sinks. The predefined data sources support reading from files, directories, and sockets, as well as from collections and iterators. The predefined data sinks support writing data to files, to standard output (stdout), to standard error (stderr), and to sockets.
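This post focuses on sources, but for completeness here is a minimal sketch of the simplest predefined sinks, print() for stdout and printToErr() for stderr. The class name, package, and sample elements are made up for illustration.

package com.xx.common.study.sink;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Minimal sketch of the predefined print sinks (hypothetical class, for illustration only).
 */
public class PrintSinkSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // print() writes each record to stdout, printToErr() writes to stderr
        env.fromElements("a", "b", "c").print("stdout");
        env.fromElements("a", "b", "c").printToErr("stderr");

        env.execute();
    }
}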
2. Common Sources
The predefined data sources can read from files, directories, and sockets, as well as from collections and iterators. The examples below walk through the ones used most often.
- List
package com.xx.common.study.source;

import com.xx.common.study.domain.WaterSensor;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.ArrayList;
import java.util.List;

/**
 * @author xiaxing
 * @describe Reads data from a collection
 * @since 2024/5/14 16:35
 */
public class ListSourceDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        List<WaterSensor> list = new ArrayList<>();
        list.add(new WaterSensor("s1", 1L, 1));
        list.add(new WaterSensor("s2", 2L, 2));
        list.add(new WaterSensor("s3", 3L, 3));

        // Read from the collection, letting Flink infer the element type
        DataStreamSource<WaterSensor> collectionStream = env.fromCollection(list);
        // Read from the collection, specifying the element type explicitly
        DataStreamSource<WaterSensor> collectionStream1 = env.fromCollection(list, TypeInformation.of(WaterSensor.class));

        collectionStream.print("collection source");
        collectionStream1.print("collection source with explicit type");

        env.execute();
    }
}
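The WaterSensor POJO referenced above is not shown in this post. A minimal sketch consistent with the constructor calls above might look like the following; the field names (id, ts, vc) are assumptions.

package com.xx.common.study.domain;

/**
 * Hypothetical sketch of the WaterSensor POJO used in these examples;
 * the field names are assumptions based on the constructor calls above.
 */
public class WaterSensor {

    public String id;   // sensor id
    public Long ts;     // timestamp
    public Integer vc;  // measured value

    // Flink's POJO rules require a public no-argument constructor
    public WaterSensor() {
    }

    public WaterSensor(String id, Long ts, Integer vc) {
        this.id = id;
        this.ts = ts;
        this.vc = vc;
    }

    @Override
    public String toString() {
        return "WaterSensor{id='" + id + "', ts=" + ts + ", vc=" + vc + "}";
    }
}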
- File
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-files</artifactId>
    <version>${flink.version}</version>
</dependency>
package com.xx.common.study.source;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author xiaxing
 * @describe Reads data from a file
 * @since 2024/5/14 16:47
 */
public class FileSourceDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        FileSource<String> fileSource = FileSource
                // The path can be a single file, a directory, or an HDFS path
                .forRecordStreamFormat(new TextLineInputFormat(), new Path("input/word.txt"))
                .build();

        env.fromSource(fileSource, WatermarkStrategy.noWatermarks(), "file").print();

        env.execute();
    }
}
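As written, the file source reads the given path once and the job finishes (a bounded stream). The FileSource builder also supports continuously watching a directory for new files; below is a sketch of that variant, where the 5-second discovery interval, the input directory, and the class name are arbitrary example values.

package com.xx.common.study.source;

import java.time.Duration;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.connector.file.src.FileSource;
import org.apache.flink.connector.file.src.reader.TextLineInputFormat;
import org.apache.flink.core.fs.Path;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Sketch: continuously watch a directory for new files (hypothetical class, for illustration only).
 */
public class FileMonitorSourceSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        FileSource<String> fileSource = FileSource
                .forRecordStreamFormat(new TextLineInputFormat(), new Path("input"))
                // Re-scan the path for newly added files every 5 seconds
                .monitorContinuously(Duration.ofSeconds(5))
                .build();

        env.fromSource(fileSource, WatermarkStrategy.noWatermarks(), "file-monitor").print();

        env.execute();
    }
}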
- Socket
package com.xx.common.study.source;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author xiaxing
 * @describe Reads data from a socket
 * @since 2024/5/14 16:57
 */
public class SocketSourceDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Each line received on localhost:7777 becomes one String record
        DataStream<String> socketStream = env.socketTextStream("localhost", 7777);
        socketStream.print();

        env.execute();
    }
}
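To try this out, start a simple text server before launching the job, for example with netcat: nc -lk 7777. Every line typed into that terminal arrives as one String record and is printed by the job. The socket source offers no fault-tolerance guarantees, so it is best treated as a tool for demos and local experiments rather than production pipelines.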
- Kafka
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka</artifactId>
    <version>${flink.version}</version>
</dependency>
package com.xx.common.study.source;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author xiaxing
 * @describe Reads data from Kafka
 * @since 2024/5/14 16:59
 */
public class KafkaSourceDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        KafkaSource<String> kafkaSource = KafkaSource.<String>builder()
                // Kafka broker address(es)
                .setBootstrapServers("localhost:9092")
                // Topic(s) to subscribe to
                .setTopics("topic")
                // Consumer group id
                .setGroupId("groupId")
                // Start reading from the latest offsets
                .setStartingOffsets(OffsetsInitializer.latest())
                // Deserialize only the record value, as a plain String
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();

        DataStreamSource<String> stream = env.fromSource(kafkaSource, WatermarkStrategy.noWatermarks(), "kafka-source");
        stream.print();

        env.execute();
    }
}
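A note on offsets: OffsetsInitializer.latest() makes every run start at the tail of the topic, ignoring anything consumed before. If the job should resume where the consumer group left off, the connector offers other initializers; the lines below are a sketch of two common alternatives and would replace the setStartingOffsets call in the builder above (committedOffsets additionally uses org.apache.kafka.clients.consumer.OffsetResetStrategy, which the Kafka connector dependency already brings in).

// Start from the earliest offsets available in the topic
.setStartingOffsets(OffsetsInitializer.earliest())

// Or: resume from the offsets committed by this consumer group,
// falling back to the earliest offsets if none have been committed yet
.setStartingOffsets(OffsetsInitializer.committedOffsets(OffsetResetStrategy.EARLIEST))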
- DataGen
<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-datagen</artifactId>
    <version>${flink.version}</version>
</dependency>
The DataGen connector provides a Source implementation that generates input data for a Flink pipeline, which is handy for testing a job without depending on an external system.
package com.xx.common.study.source;

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.connector.datagen.source.DataGeneratorSource;
import org.apache.flink.connector.datagen.source.GeneratorFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * @author xiaxing
 * @describe DataGen connector
 * @since 2024/5/14 17:27
 */
public class SourceDemo {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        DataGeneratorSource<String> dataGeneratorSource =
                new DataGeneratorSource<>(
                        // Function that turns each generated sequence number into a record
                        new GeneratorFunction<Long, String>() {
                            @Override
                            public String map(Long value) {
                                return "Number:" + value;
                            }
                        },
                        // Total number of records to generate
                        Long.MAX_VALUE,
                        // Emission rate: records per second
                        RateLimiterStrategy.perSecond(10),
                        // Type information of the produced records
                        Types.STRING
                );

        env.fromSource(dataGeneratorSource, WatermarkStrategy.noWatermarks(), "data_generator").print();

        env.execute();
    }
}
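The generator is not limited to strings; it can emit any type for which you provide a GeneratorFunction and type information. Below is a sketch that produces WaterSensor records, reusing the POJO from the collection example; the class name, record count, and field values are made up for illustration.

package com.xx.common.study.source;

import com.xx.common.study.domain.WaterSensor;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.connector.source.util.ratelimit.RateLimiterStrategy;
import org.apache.flink.connector.datagen.source.DataGeneratorSource;
import org.apache.flink.connector.datagen.source.GeneratorFunction;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

/**
 * Sketch: DataGen source producing WaterSensor POJOs (hypothetical class, for illustration only).
 */
public class WaterSensorGeneratorSketch {

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Map each generated sequence number onto a WaterSensor record
        GeneratorFunction<Long, WaterSensor> generatorFunction =
                value -> new WaterSensor("s" + (value % 3 + 1), value, value.intValue());

        DataGeneratorSource<WaterSensor> source = new DataGeneratorSource<>(
                generatorFunction,
                // Generate 100 records in total, at 10 records per second
                100,
                RateLimiterStrategy.perSecond(10),
                TypeInformation.of(WaterSensor.class)
        );

        env.fromSource(source, WatermarkStrategy.noWatermarks(), "water-sensor-generator").print();

        env.execute();
    }
}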