Flink: Reading Data
1. Reading Data from a Collection
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._

// Case class describing a single sensor reading
case class SensorReading(id: String, timestamp: Long, temperature: Double)

/**
 * Read data from a collection
 */
object Collection {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Read data from a collection
    val stream: DataStream[SensorReading] = env.fromCollection(List(
      SensorReading("sensor_1", 1547718199, 35.80018327300259),
      SensorReading("sensor_6", 1547718201, 15.402984393403084),
      SensorReading("sensor_7", 1547718202, 6.720945201171228),
      SensorReading("sensor_10", 1547718205, 38.101067604893444)
    ))
    // Print the result
    stream.print("stream").setParallelism(2)
    // Execute the job
    env.execute()
  }
}
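For quick experiments, the Scala API also provides fromElements, which takes the values directly instead of wrapping them in a collection. A minimal sketch, assuming the same env and SensorReading as above:

// Alternative to fromCollection: pass the elements directly
val stream2: DataStream[SensorReading] = env.fromElements(
  SensorReading("sensor_1", 1547718199, 35.8),
  SensorReading("sensor_6", 1547718201, 15.4)
)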
2. Reading Data from a File
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

/**
 * Read data from a file
 */
object File {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Read the file and parse each comma-separated line into a SensorReading
    val inputPath = "/Users/zgl/Documents/IdeaProjects/FlinkTutorial/src/sensors.txt"
    val stream: DataStream[SensorReading] = env.readTextFile(inputPath)
      .map(line => {
        val paras = line.split(",")
        SensorReading(paras(0).trim, paras(1).trim.toLong, paras(2).trim.toDouble)
      })
    // Print the result
    stream.print("stream").setParallelism(4)
    // Execute the job
    env.execute()
  }
}
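Each line of sensors.txt is expected to look like sensor_1,1547718199,35.8; a malformed line would make the map above throw and fail the job. A hedged variant (my own sketch, using scala.util.Try) that silently drops unparseable lines instead:

import scala.util.Try

// Drop lines that fail to parse instead of failing the job
val safeStream: DataStream[SensorReading] = env.readTextFile(inputPath)
  .flatMap(line => Try {
    val p = line.split(",")
    SensorReading(p(0).trim, p(1).trim.toLong, p(2).trim.toDouble)
  }.toOption.toList)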
3. Reading Data from a Kafka Message Queue
-
First, add the Kafka connector dependency to the project's pom.xml:

<dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.7.2</version>
</dependency>
-
The consumer code:
import java.util.Properties

import org.apache.flink.api.scala._
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

/**
 * Read data from Kafka
 */
object Kafka {
  def main(args: Array[String]): Unit = {
    // Consumer configuration
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop101:9092")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")
    // Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Consume the "sensor" topic from Kafka
    val stream: DataStream[String] = env.addSource(new FlinkKafkaConsumer011[String]("sensor", new SimpleStringSchema(), properties))
    // Print the result
    stream.print("stream").setParallelism(1)
    // Execute the job
    env.execute()
  }
}
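The consumer above yields raw strings. If the topic carries the same comma-separated format as sensors.txt in section 2 (an assumption about the producer, not something the tutorial states), the parsing step can be reused, reusing the SensorReading case class:

// Assumes each Kafka record is "id,timestamp,temperature", as in section 2
val readings: DataStream[SensorReading] = stream.map(line => {
  val p = line.split(",")
  SensorReading(p(0).trim, p(1).trim.toLong, p(2).trim.toDouble)
})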
4. Custom Source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._

import scala.util.Random

/**
 * Use a custom Source
 */
object MySource {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // Add the custom Source
    val stream: DataStream[SensorReading] = env.addSource(new MySensorSource)
    // Print the result
    stream.print().setParallelism(1)
    // Execute the job
    env.execute()
  }
}
/**
 * Custom Source: extend SourceFunction and override its methods
 */
class MySensorSource extends SourceFunction[SensorReading] {
  // Flag indicating whether the source is still running
  var running = true

  override def cancel(): Unit = {
    running = false
  }

  override def run(sourceContext: SourceFunction.SourceContext[SensorReading]): Unit = {
    val random = new Random()
    // Initialize ten sensors with a random baseline temperature
    var curTemp = 1.to(10).map(
      i => ("sensor_" + i, 65 + random.nextGaussian() * 20)
    )
    while (running) {
      // Apply a small Gaussian drift to every sensor
      curTemp = curTemp.map(
        t => (t._1, t._2 + random.nextGaussian())
      )
      val curTime: Long = System.currentTimeMillis()
      // Emit one reading per sensor with the current timestamp
      curTemp.foreach(
        t => sourceContext.collect(SensorReading(t._1, curTime, t._2))
      )
      // Emit a batch every 100 ms
      Thread.sleep(100)
    }
  }
}
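A plain SourceFunction always runs with parallelism 1, and setting a higher parallelism on such a source fails at job submission. If a parallel source is needed, one option is ParallelSourceFunction, where every parallel subtask executes its own run(). The class below is a hypothetical sketch of mine, not part of the original tutorial:

import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}

// Hypothetical parallel variant: each subtask emits readings independently
class MyParallelSensorSource extends ParallelSourceFunction[SensorReading] {
  var running = true

  override def cancel(): Unit = {
    running = false
  }

  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    val random = new scala.util.Random()
    while (running) {
      val temp = 65 + random.nextGaussian() * 20
      ctx.collect(SensorReading("sensor_" + random.nextInt(10), System.currentTimeMillis(), temp))
      Thread.sleep(100)
    }
  }
}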