Source: reading data from a file
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
// Import the implicit conversions; placing the import here avoids IDEA code-completion problems
import org.apache.flink.streaming.api.scala._

object FileReadSource {
  def main(args: Array[String]): Unit = {
    // Initialize the Flink streaming execution environment
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    // Read the data
    val dataStream = streamEnv.readTextFile("./data/trafficdata")
    val ds = dataStream.map(line => {
      val arr = line.split("\t")
      // Reassemble the first seven tab-separated fields of each line
      val info = s"${arr(0)}\t${arr(1)}\t${arr(2)}\t${arr(3)}\t${arr(4)}\t${arr(5)}\t${arr(6)}"
      info
    })
    ds.print()
    streamEnv.execute("flink start")
  }
}
The input data layout (tab-separated fields):
Final output:
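The map above indexes arr(0) through arr(6), so it assumes every line carries at least seven tab-separated fields. A defensive variant (a sketch, not part of the original program) drops malformed lines first:

val ds = dataStream
  .map(_.split("\t"))
  .filter(_.length >= 7)                   // drop lines with fewer than seven fields
  .map(arr => arr.take(7).mkString("\t"))  // reassemble the first seven fields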
Source: reading data from Kafka
Plain string data
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
import java.util.Properties

object KafkaSourceWithoutKey {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Assemble the consumer configuration
    val props = new Properties()
    props.setProperty("bootstrap.servers", "172.16.254.4:9092,172.16.254.5:9092,172.16.254.6:9092")
    // props.setProperty("bootstrap.servers", "node1:9092,node2:9092,node3:9092")
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("group.id", "test01_group")
    // props.setProperty("auto.offset.reset", "latest") // optional: the consumer defaults to setStartFromGroupOffsets(), and this property would be ignored anyway
    // Read the data from Kafka
    val lines: DataStream[String] = env.addSource(new FlinkKafkaConsumer[String]("test01", new SimpleStringSchema(), props))
    lines.print()
    // Trigger execution
    env.execute()
  }
}
To test, send a few messages to the test01 topic from the Kafka cluster:
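A sketch of producing test messages, assuming the stock Kafka CLI scripts on a broker node (node1 is the host alias from the commented-out bootstrap.servers line; older Kafka versions use --broker-list, newer ones use --bootstrap-server):

kafka-console-producer.sh --broker-list node1:9092 --topic test01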
Printed output:
Standard key-value data
import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, createTuple2TypeInformation}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
import java.util.Properties

object KafkaSourceWithKey {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val props = new Properties()
    props.setProperty("bootstrap.servers", "172.16.254.4:9092,172.16.254.5:9092,172.16.254.6:9092")
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("group.id", "test02_group")
    // props.setProperty("auto.offset.reset", "latest") // optional: the start position is set directly on the FlinkKafkaConsumer below
    val flinkKafkaConsumer = new FlinkKafkaConsumer[(String, String)]("test02", new KafkaDeserializationSchema[(String, String)] {
      // Whether the current record marks the end of the stream
      override def isEndOfStream(t: (String, String)): Boolean = false

      override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String) = {
        // Fall back to placeholder values when the key or value is absent
        var key = "0"
        var value = "null"
        if (consumerRecord.key() != null) {
          key = new String(consumerRecord.key(), "UTF-8")
        }
        if (consumerRecord.value() != null) {
          value = new String(consumerRecord.value(), "UTF-8")
        }
        (key, value)
      }

      // The tuple type produced; createTuple2TypeInformation requires the implicit-conversion import
      override def getProducedType: TypeInformation[(String, String)] = {
        createTuple2TypeInformation(createTypeInformation[String], createTypeInformation[String])
      }
    }, props)
    // Start reading from the latest offset; the default is setStartFromGroupOffsets
    val infos: DataStream[(String, String)] = env.addSource(flinkKafkaConsumer.setStartFromLatest())
    // Print the result
    infos.print()
    // Trigger execution
    env.execute()
  }
}
Output:
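setStartFromLatest() above is only one of the start-position setters the FlinkKafkaConsumer API offers. A minimal sketch of the alternatives (the timestamp is an arbitrary example value):

// Alternative start positions on the same consumer instance
flinkKafkaConsumer.setStartFromEarliest()                 // earliest offset still retained by Kafka
flinkKafkaConsumer.setStartFromGroupOffsets()             // committed group offsets (the default)
flinkKafkaConsumer.setStartFromTimestamp(1577071519462L)  // first record at or after this epoch-millis timestamp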
Source: reading data from a collection
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
// Import the implicit conversions; placing the import here avoids IDEA code-completion problems
import org.apache.flink.streaming.api.scala._

case class StationLog(sid: String, callOut: String, callIn: String, callType: String, callTime: Long, duration: Long)

object CollectionSource {
  def main(args: Array[String]): Unit = {
    // Initialize the Flink streaming execution environment
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)
    // Read the data
    val dataStream = streamEnv.fromCollection(Array(
      StationLog("001", "186", "189", "busy", 1577071519462L, 0),
      StationLog("002", "186", "188", "busy", 1577071520462L, 0),
      StationLog("003", "183", "188", "busy", 1577071521462L, 0),
      StationLog("004", "186", "188", "success", 1577071522462L, 32)
    ))
    dataStream.print()
    streamEnv.execute()
  }
}
Output:
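For a handful of fixed test elements, fromElements is an equivalent shorthand; a minimal sketch against the same StationLog case class:

// Equivalent shorthand for small, fixed test data
val ds = streamEnv.fromElements(
  StationLog("001", "186", "189", "busy", 1577071519462L, 0),
  StationLog("002", "186", "188", "busy", 1577071520462L, 0)
)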
Source: reading data from a socket
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._

object SocketSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Get the input data from a socket
    val text: DataStream[String] = env.socketTextStream("127.0.0.1", 6666)
    // Split each line into lowercase words and keep a running count per word
    val counts = text.flatMap(_.toLowerCase().split("\\W+"))
      .map((_, 1)).keyBy(0).sum(1)
    counts.print()
    env.execute("Streaming Count")
  }
}
Output:
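To drive this example, start a netcat listener on the same port before launching the job (assuming a Unix-like shell; flag spelling can differ between netcat variants):

nc -lk 6666

Each line typed into the nc session is then split into words and the per-word counts are printed by the job.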