Learning objectives
1. Getting-started example (master)
2. DataStream sources (master)
3. DataStream transformations (master)
4. DataStream sinks (master)
1. Getting-started example
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment, WindowedStream}
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

//Streaming word count over a socket source
object StreamWordCount {
  def main(args: Array[String]): Unit = {
    /**
     * Approach:
     * 1. Get the stream execution environment
     * 2. Build a socket source with a host and port
     * 3. Convert the received data into (word, 1) tuples
     * 4. Use keyBy to partition (group) the stream
     * 5. Use timeWindow to set the window length (compute every 5 seconds)
     * 6. Use sum to aggregate
     * 7. Print the result
     * 8. Start the job
     * 9. On Linux, run `nc -lk <port>` to listen on the port and send words
     */
    //1. Create the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the socket source
    val socketDataStream: DataStream[String] = env.socketTextStream("node01", 9999)
    //3. Process the data
    import org.apache.flink.api.scala._
    //4. Convert the received data into (word, 1) tuples and group with keyBy
    val groupKeyedStream: KeyedStream[(String, Int), Tuple] = socketDataStream.flatMap(x => x.split(",")).map((_, 1)).keyBy(0)
    //5. Use timeWindow to set the window length (compute every 5 seconds)
    val windowedStream: WindowedStream[(String, Int), Tuple, TimeWindow] = groupKeyedStream.timeWindow(Time.seconds(5))
    //6. Use sum to aggregate
    val resultDataStream: DataStream[(String, Int)] = windowedStream.sum(1)
    //7. Print the result
    resultDataStream.print()
    //8. Execute the job
    env.execute("StreamWordCount")
  }
}
2. DataStream sources
Sources fall into four categories:
Collection-based source
File-based source: reads text files, i.e. files conforming to the TextInputFormat spec, and returns them as strings
Socket-based source: reads from a socket; elements can be split with a delimiter
Custom source
In addition, Flink provides a Kafka source as a connector (covered in 2.4.4).
2.1 Collection-based source
CollectionSource (env.fromCollection)
ElementsSource (env.fromElements)
GenerateSequence (env.generateSequence)
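The three entries above correspond to the fromCollection, fromElements, and generateSequence methods on the execution environment. A minimal sketch, assuming an illustrative package and object name (not from the original code):
package com.czxy.flink.stream.source.collection
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Collection-based sources: fromCollection, fromElements, generateSequence
object StreamFromCollectionSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Build the three collection-based sources
    val collectionDataStream: DataStream[String] = env.fromCollection(List("hadoop", "spark", "flink"))
    val elementsDataStream: DataStream[String] = env.fromElements("hadoop", "spark", "flink")
    val sequenceDataStream: DataStream[Long] = env.generateSequence(1, 10)
    //3. Print the results
    collectionDataStream.print()
    elementsDataStream.print()
    sequenceDataStream.print()
    //4. Execute the job
    env.execute("StreamFromCollectionSource")
  }
}
Collection-based sources are bounded, so they are mainly useful for local testing.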
2.2 File-based source
package com.czxy.flink.stream.source.file
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Build a source from a text file
object StreamFromFileSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the file source
    val fileDataStream: DataStream[String] = env.readTextFile("day03/data/input/wordcount.txt")
    //3. Print the result
    fileDataStream.print()
    //4. Execute the job
    env.execute("StreamFromFileSource")
  }
}
2.3 Socket-based source
See the getting-started example above.
2.4 Custom source
2.4.1 SourceFunction: create a non-parallel source
package com.czxy.flink.stream.source.customer
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Custom non-parallel source
object StreamCustomerNoParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the source
    import org.apache.flink.api.scala._
    val noParallelDataStream: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    //3. Print the result
    noParallelDataStream.print()
    //4. Execute the job
    env.execute("StreamCustomerNoParallelSource")
  }
  //A single-threaded source that emits numbers counting up from 1
  class NoParallelSource extends SourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true
    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
2.4.2 ParallelSourceFunction: create a parallel source
package com.czxy.flink.stream.source.customer
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Custom parallel source
object StreamCustomerParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the source and run it with parallelism 2
    import org.apache.flink.api.scala._
    val parallelSourceDataStream: DataStream[Long] = env.addSource(new ParallelSource()).setParallelism(2)
    //3. Print the result
    parallelSourceDataStream.print()
    //4. Execute the job
    env.execute("StreamCustomerParallelSource")
  }
  //A parallel source; each parallel instance emits numbers counting up from 1
  class ParallelSource extends ParallelSourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true
    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
2.4.3 RichParallelSourceFunction: create a parallel source with lifecycle methods
package com.czxy.flink.stream.source.customer
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Custom parallel source based on the rich variant
object StreamCustomerRichParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Add the source and run it with parallelism 2
    import org.apache.flink.api.scala._
    val richParallelSourceDataStream: DataStream[Long] = env.addSource(new RichParallelSource()).setParallelism(2)
    //3. Print the result
    richParallelSourceDataStream.print()
    //4. Execute the job
    env.execute("StreamCustomerRichParallelSource")
  }
  //Custom rich source; each parallel instance emits numbers counting up from 1
  class RichParallelSource extends RichParallelSourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true
    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
2.4.4 Kafka source
package com.czxy.flink.stream.source.customer
import java.util.Properties
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
import org.apache.flink.streaming.util.serialization.SimpleStringSchema
//Consume data from a Kafka topic
object StreamKafkaSource {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Configure Kafka
    val topic = "test"
    val props = new Properties
    props.setProperty("bootstrap.servers", "node01:9092")
    props.setProperty("group.id", "test01")
    props.setProperty("key.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer",
      "org.apache.kafka.common.serialization.StringDeserializer")
    val consumer: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), props)
    //Start consuming from the latest offsets
    consumer.setStartFromLatest()
    import org.apache.flink.api.scala._
    //3. Add the Kafka source
    val kafkaSource: DataStream[String] = env.addSource(consumer)
    //4. Print the result
    kafkaSource.print()
    //5. Execute the job
    env.execute("StreamKafkaSource")
  }
}
2.4.5 MySQL source
package com.czxy.flink.stream.source.customer
import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Custom source that reads data from MySQL
object StreamFromMysqlSource {
  case class Student(stuId: Int, stuName: String, stuAddr: String, stuSex: String)
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Add the source
    val mysqlSource: DataStream[Student] = env.addSource(new MysqlSource())
    //3. Print the result
    mysqlSource.print()
    //4. Execute the job
    env.execute("StreamFromMysqlSource")
  }
  class MysqlSource extends RichSourceFunction[Student]() {
    //JDBC objects
    var connection: Connection = null
    var ps: PreparedStatement = null
    //open() is called once when the source is initialized
    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test"
      val username = "root"
      val password = "root"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val sql =
        """
          |select id,name,addr,sex
          |from student
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }
    //run() is called once; it emits one record per row in the result set
    override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
      val queryResultSet: ResultSet = ps.executeQuery()
      while (queryResultSet.next()) {
        val stuId: Int = queryResultSet.getInt("id")
        val stuName: String = queryResultSet.getString("name")
        val stuAddr: String = queryResultSet.getString("addr")
        val stuSex: String = queryResultSet.getString("sex")
        val student: Student = Student(stuId, stuName, stuAddr, stuSex)
        ctx.collect(student)
      }
    }
    //cancel() is called when the job is cancelled
    override def cancel(): Unit = {
    }
  }
}
3. DataStream transformations
3.1 KeyBy
Logically partitions a stream into disjoint partitions; each partition contains elements with the same key.
package com.czxy.flink.stream.transformation
import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment}
//The keyBy grouping operator
object StreamKeyBy {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data set
    import org.apache.flink.api.scala._
    val elementSource: DataStream[String] = env.fromElements("hadoop hadoop spark hive flink flink")
    //3. Turn the data into (word, 1) tuples
    val wordAndOne: DataStream[(String, Int)] = elementSource.flatMap(x => x.split(" ")).map((_, 1))
    //4. Group by the first tuple field
    val keyedStream: KeyedStream[(String, Int), Tuple] = wordAndOne.keyBy(0)
    //5. Aggregate with reduce
    val result: DataStream[(String, Int)] = keyedStream.reduce((v1, v2) => (v1._1, v1._2 + v2._2))
    //6. Print the result
    result.print().setParallelism(1)
    //7. Execute the job
    env.execute("StreamKeyBy")
  }
}
3.2 Connect
Combines two DataStreams into one ConnectedStreams while keeping the element types of the two streams separate.
package com.czxy.flink.stream.transformation
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{ConnectedStreams, DataStream, StreamExecutionEnvironment}
object StreamConnect {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Build two independent sources
    val source1: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    val source2: DataStream[Long] = env.addSource(new NoParallelSource()).setParallelism(1)
    //3. Connect the two streams into one ConnectedStreams
    val connectedStreams: ConnectedStreams[Long, Long] = source1.connect(source2)
    //4. Apply one map function per input stream (CoMap)
    val result: DataStream[String] = connectedStreams.map(item1 => {
      "item1: " + item1
    },
      item2 => {
        "item2: " + item2
      })
    //5. Print the result
    result.print()
    //6. Execute the job
    env.execute("StreamConnect")
  }
  //A single-threaded source that emits numbers counting up from 1
  class NoParallelSource extends SourceFunction[Long]() {
    var number: Long = 1L
    var isRunning: Boolean = true
    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }
    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
3.3 Split and select
Split divides one DataStream into two or more named streams; select retrieves the stream(s) for the given name(s).
package com.czxy.flink.stream.transformation
import org.apache.flink.streaming.api.scala.{DataStream, SplitStream, StreamExecutionEnvironment}
/**
 * Requirement:
 * Given the data 1, 2, 3, 4, 5, 6, 7,
 * use split and select to separate odd and even numbers, and print the odd ones.
 */
object StreamSplit {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Build the data set
    val source: DataStream[Int] = env.fromElements(1, 2, 3, 4, 5, 6, 7)
    //3. Split the stream by tagging each element as "even" or "odd"
    val splitStream: SplitStream[Int] = source.split(x => {
      (x % 2) match {
        case 0 => List("even")
        case 1 => List("odd")
      }
    })
    //4. Select the "odd" stream
    val result: DataStream[Int] = splitStream.select("odd")
    //5. Print the result
    result.print()
    //6. Execute the job
    env.execute("StreamSplit")
  }
}
4. DataStream sinks
4.1 Sink to a local file (see the batch-processing examples)
4.2 Sink to a local collection (see the batch-processing examples)
4.3 Sink to HDFS (see the batch-processing examples)
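For 4.1 and 4.3 the streaming API offers the same writeAsText call as the batch examples. A minimal sketch, assuming the output paths and the HDFS address are placeholders for your own environment (the package and object names are illustrative):
package com.czxy.flink.stream.sink
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
//Sink a stream to a local text file and to HDFS (paths are placeholders)
object StreamTextSink {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Build the data set
    val source: DataStream[String] = env.fromElements("hadoop", "spark", "flink")
    //3. Sink to a local file (4.1); parallelism 1 writes a single file
    source.writeAsText("day03/data/output/result.txt").setParallelism(1)
    //4. Sink to HDFS (4.3); requires the Hadoop filesystem dependency on the classpath
    source.writeAsText("hdfs://node01:8020/flink/output/result").setParallelism(1)
    //5. Execute the job
    env.execute("StreamTextSink")
  }
}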
4.4 Sink to Kafka
package com.czxy.flink.stream.sink
import java.util.Properties
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011
import org.apache.flink.streaming.util.serialization.SimpleStringSchema
object StreamKafkaSink {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data set
    import org.apache.flink.api.scala._
    val source: DataStream[String] = env.fromElements("1,小丽,北京,女")
    //3. Configure Kafka and build the producer
    val topic = "test"
    val properties: Properties = new Properties()
    properties.setProperty("bootstrap.servers", "node01:9092")
    val flinkKafkaProducer: FlinkKafkaProducer011[String] = new FlinkKafkaProducer011[String](topic, new SimpleStringSchema(), properties)
    //4. Add the Kafka sink
    val result: DataStreamSink[String] = source.addSink(flinkKafkaProducer)
    //5. Execute the job
    env.execute("StreamKafkaSink")
  }
}
4.5 Sink to MySQL
package com.czxy.flink.stream.sink
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.datastream.DataStreamSink
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
object StreamMysqlSink {
  case class Student(stuId: Int, stuName: String, stuAddr: String, stuSex: String)
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    import org.apache.flink.api.scala._
    //2. Build the data set
    val source: DataStream[Student] = env.fromElements(
      Student(9, "wangman", "beijing", "nv")
    )
    //3. Add the MySQL sink
    val result: DataStreamSink[Student] = source.addSink(new MysqlSink())
    //4. Execute the job
    env.execute("StreamMysqlSink")
  }
  class MysqlSink extends RichSinkFunction[Student]() {
    //JDBC objects
    var connection: Connection = null
    var ps: PreparedStatement = null
    //open() is called once when the sink is initialized
    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test?characterEncoding=utf-8&useSSL=false"
      val username = "root"
      val password = "root"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val sql =
        """
          |insert into student(id , name , addr , sex)values(?,?,?,?);
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }
    //invoke() is called once per record
    override def invoke(value: Student): Unit = {
      try {
        ps.setInt(1, value.stuId)
        ps.setString(2, value.stuName)
        ps.setString(3, value.stuAddr)
        ps.setString(4, value.stuSex)
        ps.executeUpdate()
      } catch {
        case e: Exception => println(e.getMessage)
      }
    }
  }
}
That's all for now; more next time.