Flink 01 之 数据源(DataStream)
1.1 source简介
source是程序的数据源输入,你可以通过StreamExecutionEnvironment.addSource(sourceFunction)来为你的程序添加一个source。
flink提供了大量的已经实现好的source方法,也可以自定义source: 1. 通过实现SourceFunction接口来自定义无并行度的source
2. 通过实现ParallelSourceFunction 接口 or 继承RichParallelSourceFunction 来自定义有并行度的
source。
获取source的方式
(1)基于文件 readTextFile(path) 读取文本文件,文件遵循TextInputFormat 读取规则,逐行读取并返回。
(2)基于socket socketTextStream 从socket中读取数据,元素可以通过一个分隔符切开。
(3)基于集合 fromCollection(Collection) 通过java 的collection集合创建一个数据流,集合中的所有元素必须是相同类型的。
(4)扩展数据源 addSource 可以实现读取第三方数据源的数据 系统内置提供了一批connectors,连接器会提供对应的source支持【kafka】
扩展的数据源
Apache Kafka (source/sink) 后面重点分析
Apache Cassandra (sink)
Amazon Kinesis Streams (source/sink)
Elasticsearch (sink)
Hadoop FileSystem (sink)
RabbitMQ (source/sink)
Apache NiFi (source/sink)
Twitter Streaming API (source)
1.2 数据源之collection
/**
 * Demo: builds a bounded DataStream from an in-memory Java collection,
 * prefixes each element, and prints the result.
 */
public class StreamingSourceFromCollection {
    public static void main(String[] args) throws Exception {
        // Step 1: obtain the stream execution environment.
        // (In the original text this line was accidentally commented out,
        // leaving `env` undeclared — the class could not compile.)
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Step 2: prepare some sample data.
        ArrayList<String> data = new ArrayList<String>();
        data.add("hadoop");
        data.add("spark");
        data.add("flink");

        // Step 3: create the source from the collection.
        // (This declaration was also trapped inside a comment in the original.)
        DataStreamSource<String> dataStream = env.fromCollection(data);

        // Step 4: transformation — prefix every element.
        SingleOutputStreamOperator<String> addPreStream = dataStream.map(new MapFunction<String, String>() {
            @Override
            public String map(String word) throws Exception {
                return "kaikeba_" + word;
            }
        });

        // Step 5: sink — print with a single output task.
        addPreStream.print().setParallelism(1);

        // Step 6: launch the job.
        env.execute("StreamingSourceFromCollection");
    }
}
1.3 自定义单并行度数据源
/**
 * Custom non-parallel source: emits one increasing Long per second.
 * Implementing SourceFunction directly restricts the source to parallelism 1.
 */
public class MyNoParalleSource implements SourceFunction<Long> {
    // Next value to emit; only touched by the run() thread.
    private long number = 1L;
    // volatile: cancel() is invoked from a different thread than run(),
    // so without volatile the loop might never observe the flag change.
    private volatile boolean isRunning = true;

    @Override
    public void run(SourceContext<Long> sct) throws Exception {
        while (isRunning) {
            sct.collect(number);
            number++;
            // Throttle: one record per second.
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
/**
 * Demo: consumes the custom non-parallel source, logs every element,
 * and keeps only the even numbers.
 */
public class StreamingDemoWithMyNoPralalleSource {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Source: custom single-parallelism generator.
        DataStreamSource<Long> source = env.addSource(new MyNoParalleSource()).setParallelism(1);

        // Log each incoming element and pass it through unchanged.
        SingleOutputStreamOperator<Long> logged = source.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("接受到了数据:" + value);
                return value;
            }
        });

        // Keep only even values.
        SingleOutputStreamOperator<Long> evens = logged.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long number) throws Exception {
                return number % 2 == 0;
            }
        });

        // Sink: print with a single output task, then launch the job.
        evens.print().setParallelism(1);
        env.execute("StreamingDemoWithMyNoPralalleSource");
    }
}
1.4 自定义多并行度数据源
/**
 * Custom parallel-capable source: each parallel instance emits one
 * increasing Long per second (instances produce independent sequences).
 */
public class MyParalleSource implements ParallelSourceFunction<Long> {
    // Next value to emit; only touched by the run() thread.
    private long number = 1L;
    // volatile: cancel() is invoked from a different thread than run(),
    // so without volatile the loop might never observe the flag change.
    private volatile boolean isRunning = true;

    @Override
    public void run(SourceContext<Long> sct) throws Exception {
        while (isRunning) {
            sct.collect(number);
            number++;
            // Throttle: one record per second.
            Thread.sleep(1000);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}
/**
 * Demo: consumes the custom parallel source with parallelism 2, logs every
 * element, and keeps only the even numbers.
 */
public class StreamingDemoWithMyPralalleSource {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

        // Source: two parallel instances of the generator.
        DataStreamSource<Long> numberStream = env.addSource(new MyParalleSource()).setParallelism(2);

        // Log each incoming element and pass it through unchanged.
        SingleOutputStreamOperator<Long> dataStream = numberStream.map(new MapFunction<Long, Long>() {
            @Override
            public Long map(Long value) throws Exception {
                System.out.println("接受到了数据:" + value);
                return value;
            }
        });

        // Keep only even values.
        SingleOutputStreamOperator<Long> filterDataStream = dataStream.filter(new FilterFunction<Long>() {
            @Override
            public boolean filter(Long number) throws Exception {
                return number % 2 == 0;
            }
        });

        filterDataStream.print().setParallelism(1);
        // Bug fix: the original passed the OTHER demo's name
        // ("StreamingDemoWithMyNoPralalleSource") as this job's name.
        env.execute("StreamingDemoWithMyPralalleSource");
    }
}