As a stream processing framework, Flink can handle batch workloads, i.e. static or historical data sets, as well as streaming workloads, i.e. processing live data streams and producing results in real time: as long as data keeps arriving, Flink keeps computing. The Data Source is where that data comes from.
I. Collection-based sources
1. fromCollection(Collection)
Creates a data stream from a Collection. All elements in the collection must be of the same type.
- Scala code
```scala
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForCollection {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.fromCollection(Array(
      "张三", "李四", "张三", "王五"
    ))
    stream.map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

import java.util.Arrays;

public class DataSourceForCollection {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);
        DataStreamSource<String> stream =
                streamEnv.fromCollection(Arrays.asList("张三", "李四", "张三", "王五"));
        stream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = s.split(" ");
                for (String ele: arr) {
                    if (ele.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(ele, 1));
                    }
                }
            }
        }).keyBy(0).sum(1).print();
        streamEnv.execute("collect");
    }
}
```
- Output:
2. fromElements(T …)
Creates a data stream from a given sequence of objects. All objects must be of the same type.
- Scala code
```scala
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForElement {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.fromElements("hello", "word", "spark", "word")
    stream.map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForElement {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);
        DataStreamSource<String> stream = streamEnv.fromElements("张三", "李四", "张三", "王五");
        stream.flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                String[] arr = s.split(" ");
                for (String ele: arr) {
                    if (ele.length() > 0) {
                        out.collect(new Tuple2<String, Integer>(ele, 1));
                    }
                }
            }
        }).keyBy(0).sum(1).print();
        streamEnv.execute("collect");
    }
}
```
- Output:
II. File-based sources
1. Local files
- Scala code
```scala
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForFile {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    // If a path string is passed here directly, it has to be an absolute path
    val inputPath: String = getClass.getResource("/data.txt").getPath
    val stream: DataStream[String] = streamEnv.readTextFile(inputPath)
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForFile {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.readTextFile("data/words.txt")
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                        String[] arr = s.split(" ");
                        for (String ele: arr) {
                            if (ele.length() > 0) {
                                out.collect(new Tuple2(ele, 1));
                            }
                        }
                    }
                }).keyBy(0).sum(1).print();
        streamEnv.execute();
    }
}
```
2. HDFS
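Reading from HDFS additionally requires the Hadoop client libraries to be visible to Flink. A common way to do this when submitting to a cluster is the snippet below (an assumption: Hadoop is installed on the machines running Flink; when running from an IDE, the Hadoop client dependencies must instead be on the project classpath):

```shell
# Put the Hadoop client jars on Flink's classpath before submitting the job
export HADOOP_CLASSPATH=$(hadoop classpath)
```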
- Scala code
```scala
package com.hjf.dataSource

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForHDFS {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.readTextFile("hdfs://node02:8020/hadoop/words.txt")
    stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1).print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForHDFS {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);
        streamEnv.readTextFile("hdfs://node01:8020/hadoop/words.txt")
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                        String[] arr = s.split(" ");
                        for (String ele: arr) {
                            if (ele.length() > 0) {
                                out.collect(new Tuple2<String, Integer>(ele, 1));
                            }
                        }
                    }
                }).keyBy(0).sum(1).print();
        streamEnv.execute();
    }
}
```
III. Socket-based sources
socketTextStream(String hostname, int port) - reads from a socket. Elements can be split by a delimiter.
- Scala code:
```scala
package com.hjf

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object StreamWordCount {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[String] = streamEnv.socketTextStream("node01", 8888)
    val result: DataStream[(String, Int)] = stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1)
    result.print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.util.Collector;

public class DataSourceForSocket {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);
        streamEnv.socketTextStream("node01", 8888)
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                        String[] arr = s.split(" ");
                        for (String ele: arr) {
                            if (ele.length() > 0) {
                                out.collect(new Tuple2(ele, 1));
                            }
                        }
                    }
                }).keyBy(0).sum(1).print();
        streamEnv.execute();
    }
}
```
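Before starting either job, something has to listen on node01:8888 and feed it text. For a quick test, a netcat listener is enough (assuming nc is available on node01):

```shell
# Open a listening socket on port 8888; every line typed here is sent to the Flink job
nc -lk 8888
```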
IV. Kafka-based sources
1. Reading plain String data from Kafka
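The examples below use FlinkKafkaConsumer, which lives in Flink's Kafka connector rather than in flink-streaming itself, so the connector dependency has to be on the classpath. A sketch of what that looks like with sbt (the version is an assumption and must match your Flink release; with Maven, use the corresponding flink-connector-kafka_2.1x artifact):

```scala
// build.sbt (sketch): the Kafka connector that provides FlinkKafkaConsumer
val flinkVersion = "1.10.0"  // adjust to the Flink version you actually use
libraryDependencies += "org.apache.flink" %% "flink-connector-kafka" % flinkVersion
```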
- Scala code
```scala
import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer

/**
 * @author Jiang锋时刻
 * @create 2020-07-10 16:01
 */
object DataSourceForKafkaByString {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val props: Properties = new Properties()
    props.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.setProperty("group.id", "fink01")
    // Deserializers
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    // Where to start reading
    props.setProperty("auto.offset.reset", "latest")

    val stream: DataStream[String] = streamEnv.addSource(
      new FlinkKafkaConsumer[String]("test01", new SimpleStringSchema(), props))
    val result: DataStream[(String, Int)] = stream.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1)
    result.print()
    streamEnv.execute()
  }
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.Collector;

import java.util.Properties;

/**
 * @author Jiang锋时刻
 * @create 2020-07-17 16:23
 */
public class DataSourceForKafkaByString {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);

        Properties props = new Properties();
        props.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092");
        props.setProperty("group.id", "flink2");
        // Deserializers
        props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
        props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");

        FlinkKafkaConsumer<String> consumer =
                new FlinkKafkaConsumer<String>("test1", new SimpleStringSchema(), props);
        // Start from the latest record in the topic (only new records are consumed)
        consumer.setStartFromLatest();
        // Start from a given timestamp; records before it are skipped
        // consumer.setStartFromTimestamp(123454532L);
        // Start from explicitly specified offsets; the offsets map has to be built by hand
        // consumer.setStartFromSpecificOffsets(offsets);
        // Start from the offsets last committed by this group; the group.id property must be set
        // consumer.setStartFromGroupOffsets();

        streamEnv.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
        streamEnv.addSource(consumer)
                .flatMap(new FlatMapFunction<String, Tuple2<String, Integer>>() {
                    @Override
                    public void flatMap(String s, Collector<Tuple2<String, Integer>> out) throws Exception {
                        String[] arr = s.split(" ");
                        for (String ele: arr) {
                            if (ele.length() > 0) {
                                out.collect(new Tuple2<String, Integer>(ele, 1));
                            }
                        }
                    }
                }).keyBy(0).sum(1).print();
        streamEnv.execute();
    }
}
```
- Start Kafka
```shell
kafka-server-start.sh /opt/hjf/kafka/config/server.properties
# or via a script written for this purpose
./startkafka.sh
```
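If the topic does not exist yet, create it first. A sketch (the partition and replication counts are arbitrary choices and the ZooKeeper address is an assumption; on Kafka 2.2+ you can pass --bootstrap-server node01:9092 instead of --zookeeper):

```shell
kafka-topics.sh --create --zookeeper node01:2181 --topic test01 --partitions 3 --replication-factor 2
```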
- Create a Kafka console producer
```shell
kafka-console-producer.sh --broker-list node01:9092,node02:9092,node03:9092 --topic test01
```
- Output:
2. Reading key-value data from Kafka
- Code:
```scala
package com.hjf.dataSource

import java.util.Properties

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._

object DataSourceForKafkaByKeyValue {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)

    val props: Properties = new Properties()
    props.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.setProperty("group.id", "flink02")
    // Deserializers
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    // Where to start reading
    props.setProperty("auto.offset.reset", "latest")

    val stream: DataStream[(String, String)] = streamEnv.addSource(
      new FlinkKafkaConsumer[(String, String)]("test03", new KafkaDeserializationSchema[(String, String)] {
        // Whether the record marks the end of the stream
        override def isEndOfStream(t: (String, String)): Boolean = false

        override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String) = {
          if (consumerRecord != null) {
            var key = "null"
            var value = "null"
            if (consumerRecord.key() != null) {
              key = new String(consumerRecord.key(), "UTF-8")
            }
            if (consumerRecord.value() != null) {
              value = new String(consumerRecord.value(), "UTF-8")
            }
            (key, value)
          } else {
            ("null", "null")
          }
        }

        override def getProducedType: TypeInformation[(String, String)] = {
          createTuple2TypeInformation(createTypeInformation[String], createTypeInformation[String])
        }
      }, props))
    stream.print()
    streamEnv.execute()
  }
}
```
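To produce key-value test records for this consumer, the console producer can be told to parse a key from each input line (a sketch; the separator character is an arbitrary choice):

```shell
kafka-console-producer.sh --broker-list node01:9092,node02:9092,node03:9092 --topic test03 \
  --property "parse.key=true" --property "key.separator=:"
```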
- Output:
V. Custom sources
Implement the SourceFunction interface to define a source without parallelism (its parallelism can only be 1).
Implement the ParallelSourceFunction interface, or extend RichParallelSourceFunction, to define a source whose parallelism can be set; a minimal sketch is shown at the end of this section.
- Scala code
```scala
package com.hjf.dataSource

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.functions.source.SourceFunction.SourceContext
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

import scala.util.Random

object DataSourceForCustomer {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    import org.apache.flink.streaming.api.scala._

    val stream: DataStream[StationLog] = streamEnv.addSource(new CustomerSource)
    stream.print()
    streamEnv.execute()
  }
}

case class StationLog(sid: String, callOut: String, callIn: String, callType: String, callTime: Long, duration: Long)

// Custom source
class CustomerSource extends SourceFunction[StationLog] {
  // Flag that marks whether the stream should keep running
  var flag = true

  /**
   * Starts the source.
   * In most cases this run method contains a loop so that records are produced continuously.
   * @param sourceContext context used to emit records
   */
  override def run(sourceContext: SourceContext[StationLog]): Unit = {
    val random: Random = new Random()
    val types: Array[String] = Array("fail", "busy", "barring", "success")
    while (flag) {
      1.to(10).map(one => {
        var callOut = "1860000%04d".format(random.nextInt(10000))
        var callIn = "1770000%04d".format(random.nextInt(10000))
        StationLog("station_" + random.nextInt(10), callOut, callIn,
          types(random.nextInt(4)), System.currentTimeMillis(), 0)
      }).foreach(sourceContext.collect(_)) // emit the records
      Thread.sleep(2000)
    }
  }

  // Stops the stream
  override def cancel(): Unit = {
    flag = false
  }
}
```
- Java code
```java
package com.hjf.source.mysql;

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.Random;

public class DataSourceForConsumer {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.setParallelism(1);
        streamEnv.addSource(new ConsumerSource()).print();
        streamEnv.execute();
    }
}

class StationLog {
    private String sid;
    private String callOut;
    private String callIn;
    private String callType;
    private Long callTime;
    private Long duration;

    public StationLog(String sid, String callOut, String callIn, String callType, Long callTime, Long duration) {
        this.sid = sid;
        this.callOut = callOut;
        this.callIn = callIn;
        this.callType = callType;
        this.callTime = callTime;
        this.duration = duration;
    }

    @Override
    public String toString() {
        return "StationLog(" +
                "sid='" + sid + '\'' +
                ", callOut='" + callOut + '\'' +
                ", callIn='" + callIn + '\'' +
                ", callType='" + callType + '\'' +
                ", callTime=" + callTime +
                ", duration=" + duration +
                ')';
    }
}

// Custom source
class ConsumerSource implements SourceFunction<StationLog> {
    boolean flag = true;

    @Override
    public void run(SourceContext<StationLog> out) throws Exception {
        Random random = new Random();
        String[] types = new String[]{"fail", "busy", "barring", "success"};
        while (flag) {
            for (int i = 0; i < 10; i++) {
                String callOut = String.format("1860000%04d", random.nextInt(10000));
                String callIn = String.format("1770000%04d", random.nextInt(10000));
                StationLog stationLog = new StationLog("station_" + random.nextInt(10),
                        callOut, callIn, types[random.nextInt(4)], System.currentTimeMillis(), 23L);
                out.collect(stationLog);
            }
            Thread.sleep(2000L);
        }
    }

    @Override
    public void cancel() {
        flag = false;
    }
}
```
- Output
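Both examples above implement the non-parallel SourceFunction. A source whose parallelism may be set higher implements ParallelSourceFunction instead; a minimal sketch (the generated Long values are just placeholder data):

```scala
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}

// Each parallel subtask runs its own copy of run(), so several instances produce data at once
class ParallelNumberSource extends ParallelSourceFunction[Long] {
  @volatile private var running = true

  override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
    var n = 0L
    while (running) {
      ctx.collect(n) // emit one element
      n += 1
      Thread.sleep(1000)
    }
  }

  override def cancel(): Unit = running = false
}
```

It is used the same way as the sources above, but the parallelism is no longer forced to 1, e.g. streamEnv.addSource(new ParallelNumberSource).setParallelism(4).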
VI. MySQL-based sources
- Scala code
```scala
package com.hjf.dataSource

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment

object DataSourceForMySQL {
  def main(args: Array[String]): Unit = {
    val streamEnv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.streaming.api.scala._
    streamEnv.addSource(new MySQLSource()).print()
    streamEnv.execute()
  }

  class MySQLSource() extends RichSourceFunction[Student] {
    var conn: Connection = _
    var ps: PreparedStatement = _

    // Connect to MySQL and prepare the query
    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://node01:3306/test"
      val username = "root"
      val password = "123456"
      Class.forName(driver)
      conn = DriverManager.getConnection(url, username, password)
      val sql = "select * from student"
      ps = conn.prepareStatement(sql)
    }

    // Close the connection and release resources
    override def close(): Unit = {
      if (conn != null) {
        conn.close()
      }
      if (ps != null) {
        ps.close()
      }
    }

    override def run(out: SourceFunction.SourceContext[Student]): Unit = {
      val resultSet: ResultSet = ps.executeQuery()
      while (resultSet.next()) {
        val student: Student = new Student(
          resultSet.getInt("id"),
          resultSet.getString("name"),
          resultSet.getString("password"),
          resultSet.getInt("age")
        )
        out.collect(student)
      }
    }

    override def cancel(): Unit = {}
  }

  // Student record
  case class Student(id: Int, name: String, password: String, age: Int)
}
```
- Java code
```java
package com.hjf.source;

import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.RichSourceFunction;

import java.sql.*;

public class DataSourceForMysql {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment streamEnv = StreamExecutionEnvironment.getExecutionEnvironment();
        streamEnv.addSource(new MySQLSource()).print();
        streamEnv.execute();
    }
}

class Student {
    private int id;
    private String name;
    private String password;
    private int age;

    public Student(int id, String name, String password, int age) {
        this.id = id;
        this.name = name;
        this.password = password;
        this.age = age;
    }

    @Override
    public String toString() {
        return "Student{" +
                "id=" + id +
                ", name='" + name + '\'' +
                ", password='" + password + '\'' +
                ", age=" + age +
                '}';
    }
}

// Custom MySQL source
class MySQLSource extends RichSourceFunction<Student> {
    private Connection conn;
    private PreparedStatement ps;

    @Override
    public void open(Configuration parameters) throws Exception {
        conn = getConnection();
        String sql = "select * from student;";
        ps = conn.prepareStatement(sql);
    }

    private Connection getConnection() throws ClassNotFoundException, SQLException {
        Connection connection = null;
        Class.forName("com.mysql.jdbc.Driver");
        connection = DriverManager.getConnection("jdbc:mysql://node01:3306/test", "root", "123456");
        return connection;
    }

    @Override
    public void close() throws Exception {
        if (conn != null) {
            conn.close();
        }
        if (ps != null) {
            ps.close();
        }
    }

    @Override
    public void run(SourceContext<Student> out) throws Exception {
        ResultSet resultSet = ps.executeQuery();
        while (resultSet.next()) {
            Student student = new Student(
                    resultSet.getInt("id"),
                    resultSet.getString("name"),
                    resultSet.getString("password"),
                    resultSet.getInt("age")
            );
            out.collect(student);
        }
    }

    @Override
    public void cancel() {
    }
}
```
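Both versions also need the MySQL JDBC driver on the classpath; the com.mysql.jdbc.Driver class used above comes from the 5.x connector. A sketch with sbt (the exact version is an assumption):

```scala
// build.sbt (sketch): JDBC driver that provides com.mysql.jdbc.Driver
libraryDependencies += "mysql" % "mysql-connector-java" % "5.1.47"
```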
VII. Characteristics of the different sources
- Collection-based: bounded data sets, mainly intended for local testing
- File-based: suited to watching a file for changes and reading its contents
- Socket-based: listens on a given host and port and pulls data from the socket
- Custom addSource: in most real scenarios the data is unbounded and keeps arriving, for example when consuming a Kafka topic; this is where addSource is needed
References:
- This article draws on the courseware of the 尚学堂 (Shangxuetang) Flink course
- This article draws on the blog post: Flink 从 0 到 1 学习 —— Data Source 介绍
- This article draws on the blog post: Flink消费Kafka数据时指定offset的五种方式