1. Reading data from a collection
import org.apache.flink.streaming.api.scala._

// Sample case class: sensor id, timestamp, temperature
case class SensorReading(id: String, timestamp: Long, temperature: Double)

object Sensor {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val stream1 = env
      .fromCollection(List(
        SensorReading("sensor_1", 1547718199, 35.8),
        SensorReading("sensor_6", 1547718201, 15.4),
        SensorReading("sensor_7", 1547718202, 6.7),
        SensorReading("sensor_10", 1547718205, 38.1)
      ))
    stream1.print("stream1:").setParallelism(1)
    env.execute()
  }
}
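For quick experiments, fromElements can be used instead of fromCollection; it takes a varargs list of elements directly. A minimal sketch (the object name is illustrative):

import org.apache.flink.streaming.api.scala._

object FromElementsTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // fromElements accepts individual elements of any serializable type
    val stream = env.fromElements(1, 2, 3, 4)
    stream.print("elements:")
    env.execute()
  }
}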
2. Reading data from a file
This example uses the batch ExecutionEnvironment (DataSet API) to read a text file and count words:

import org.apache.flink.api.scala._

def main(args: Array[String]): Unit = {
  val env = ExecutionEnvironment.getExecutionEnvironment
  val inputDataSet = env.readTextFile("F:\\SparkWorkSpace\\flink-learning\\src\\main\\resources\\word.txt")
  val resultDataSet = inputDataSet.flatMap(_.split(" "))
    .map((_, 1))
    .groupBy(0) // group by the first element of the tuple (the word)
    .sum(1)     // sum the second element of the tuple (the count)
  resultDataSet.print()
}
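If the file should be consumed as a stream instead, StreamExecutionEnvironment also provides readTextFile, which returns a DataStream[String] with one record per line. A minimal sketch of the streaming word count (the object name is illustrative, the path is the same as above):

import org.apache.flink.streaming.api.scala._

object StreamFileSourceTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // readTextFile returns a DataStream[String], one record per line
    val lines = env.readTextFile("F:\\SparkWorkSpace\\flink-learning\\src\\main\\resources\\word.txt")
    val counts = lines.flatMap(_.split(" "))
      .map((_, 1))
      .keyBy(0) // key by the word
      .sum(1)   // running count per word
    counts.print()
    env.execute("stream wordcount")
  }
}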
3. Using a Kafka message queue as the source
POM
<dependencies>
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-scala_2.11</artifactId>
    <version>1.10.0</version>
  </dependency>
  <!-- https://mvnrepository.com/artifact/org.apache.flink/flink-streaming-scala -->
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-streaming-scala_2.11</artifactId>
    <version>1.10.0</version>
  </dependency>
  <dependency>
    <groupId>org.apache.flink</groupId>
    <artifactId>flink-connector-kafka-0.11_2.11</artifactId>
    <version>1.10.0</version>
  </dependency>
</dependencies>
The code is as follows:
import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

object SourceTest {
  def main(args: Array[String]): Unit = {
    // Create the execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // env.setParallelism(1)

    // Read data from Kafka
    val properties = new Properties()
    properties.setProperty("bootstrap.servers", "hadoop:9091,hadoop:9092,hadoop:9093")
    properties.setProperty("group.id", "consumer-group")
    properties.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    properties.setProperty("auto.offset.reset", "latest")

    val stream2 = env.addSource(new FlinkKafkaConsumer011[String]("GMALL_EVENT_0105", new SimpleStringSchema(), properties))

    // Note: these printlns run while the job graph is being built,
    // before execute() actually starts the streaming job
    println("Start execution")
    stream2.print("hello")
    println("End execution")

    env.execute("xxxx")
  }
}
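Note that auto.offset.reset only takes effect when the consumer group has no committed offset. The start position can also be set explicitly on the consumer in code; a sketch reusing the properties and topic from above:

val kafkaConsumer = new FlinkKafkaConsumer011[String]("GMALL_EVENT_0105", new SimpleStringSchema(), properties)
// Read the topic from the beginning instead of the committed group offsets
kafkaConsumer.setStartFromEarliest()
val stream = env.addSource(kafkaConsumer)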
4. Custom source
Besides the sources above, we can also define our own source. All that is required is to pass a SourceFunction to addSource.
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._

import scala.util.Random

object UserDefinedSourceTest {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Use the custom source
    val dataStream: DataStream[SensorReading] = env.addSource(new MySource)
    dataStream.print("-->")
    env.execute("xxxx")
  }
}

case class SensorReading(id: String, timestamp: Long, temperature: Double)

class MySource extends SourceFunction[SensorReading] {
  // flg indicates whether the source is still running
  var flg = true

  override def cancel(): Unit = flg = false

  override def run(sourceContext: SourceFunction.SourceContext[SensorReading]): Unit = {
    // Initialize a random number generator
    val rand = new Random()
    // Initial temperatures: Gaussian (normally distributed) random values
    var curTemp = 1.to(10).map {
      i => ("sensor_" + i, 65 + rand.nextGaussian() * 20)
    }
    while (flg) {
      // Update the temperature values with a small random walk
      curTemp = curTemp.map(
        t => (t._1, t._2 + rand.nextGaussian())
      )
      // Current timestamp
      val curTime = System.currentTimeMillis()
      curTemp.foreach {
        t => sourceContext.collect(SensorReading(t._1, curTime, t._2))
      }
      // Emit one reading per sensor every second
      Thread.sleep(1000)
    }
  }
}
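A plain SourceFunction always runs with parallelism 1. To run a source in parallel, implement ParallelSourceFunction instead (or RichParallelSourceFunction for access to the runtime context); each parallel subtask then executes its own copy of run(). A minimal sketch reusing SensorReading from above (the class name is illustrative):

import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}

import scala.util.Random

// Each parallel subtask runs its own instance of this source
class MyParallelSource extends ParallelSourceFunction[SensorReading] {
  var flg = true

  override def cancel(): Unit = flg = false

  override def run(ctx: SourceFunction.SourceContext[SensorReading]): Unit = {
    val rand = new Random()
    while (flg) {
      // Emit one random reading per second per subtask
      ctx.collect(SensorReading("sensor_" + rand.nextInt(10), System.currentTimeMillis(), 65 + rand.nextGaussian() * 20))
      Thread.sleep(1000)
    }
  }
}

// Used like any other source, except that parallelism > 1 is now allowed:
// env.addSource(new MyParallelSource).setParallelism(4)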