As a stream-processing framework, Flink can be used for batch processing, i.e., working over static or historical data sets, and for stream processing, i.e., handling real-time data streams and producing results continuously: as long as data keeps arriving, Flink keeps computing. A Data Source is where that data comes from.
I. Collection-based sources
1. fromCollection(Collection)
Creates a data stream from a Collection. All elements in the collection must be of the same type.
- Scala code
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForCollection {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.fromCollection(Array(
      "张三", "李四", "张三", "王五"
    ))
    stream.map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
- Java code
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

public class DataSourceForCollection {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);

        DataStreamSource<String> stream =
                streamEnv.fromCollection(Arrays.asList("张三", "李四", "张三", "王五"));

        // Split each element on spaces, emit (word, 1), then key by the word
        // and keep a running count per word.
        stream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = s.split(" ");
                for (String ele : arr) {
                    if (ele.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(ele, 1));
                    }
                }
            }
        }).keyBy(0).sum(1).print();

        streamEnv.execute("collect");
    }
}
- Run result
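To make the "same element type" requirement concrete, here is a hypothetical variation of the Java example in which the collection holds a user-defined POJO instead of strings. The DataSourceForPojo and Person classes below are illustrative assumptions, not part of the original example.

package com.hjf.source;

import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

import java.util.Arrays;

// Hypothetical variation (not from the original post): fromCollection with a
// user-defined POJO; every element in the collection is still the same type.
public class DataSourceForPojo {

    // Flink treats this as a POJO: public no-arg constructor plus public fields.
    public static class Person {
        public String name;
        public int age;

        public Person() {}

        public Person(String name, int age) {
            this.name = name;
            this.age = age;
        }
    }

    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);

        DataStreamSource<Person> stream = streamEnv.fromCollection(Arrays.asList(
                new Person("张三", 20),
                new Person("李四", 25),
                new Person("张三", 30)));

        // Map each Person to (name, 1) and keep a running count per name,
        // mirroring the String example above.
        stream.map(new MapFunction<Person, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(Person p) {
                return new Tuple2<String, Integer>(p.name, 1);
            }
        }).keyBy(0).sum(1).print();

        streamEnv.execute("pojo-collection");
    }
}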
2. fromElements(T …)
Creates a data stream from a given sequence of objects. All objects must be of the same type.
- Scala code
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForElement {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.fromElements("hello", "word", "spark", "word")
    stream.map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
- Java code
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForElement {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);

        DataStreamSource<String> stream = streamEnv.fromElements("张三", "李四", "张三", "王五");

        stream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = s.split(" ");
                for (String ele : arr) {
                    if (ele.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(ele, 1));
                    }
                }
            }
        }).keyBy(0).sum(1).print();

        streamEnv.execute("collect");
    }
}
- Run result
II. File-based sources
1. Local file
- Scala code
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForFile {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    // If a path is passed here directly instead of being resolved from the classpath,
    // it must be an absolute path
    val inputPath: String = getClass.getResource("/data.txt").getPath
    val stream: DataStream[String] = streamEnv.readTextFile(inputPath)
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
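The earlier sections pair each Scala example with a Java version; in that spirit, a minimal Java sketch of the same local-file word count could look like the following. The class name DataSourceForFile and the /data.txt classpath resource are assumptions for illustration.

package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForFile {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);

        // Resolve data.txt from the classpath; a path passed directly must be absolute.
        String inputPath = DataSourceForFile.class.getResource("/data.txt").getPath();
        DataStreamSource<String> stream = streamEnv.readTextFile(inputPath);

        // Split each line on spaces, emit (word, 1), then keep a running count per word.
        stream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                for (String word : s.split(" ")) {
                    if (word.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(word, 1));
                    }
                }
            }
        }).keyBy(0).sum(1).print();

        streamEnv.execute("file");
    }
}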