1 Transformation
Like the DataSet API, a DataStream also supports a series of Transformation operations:
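As a quick orientation before the individual operators, here is a minimal sketch (hypothetical inline data, not part of the demos below) chaining a few common transformations:

import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object TransformationTour {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val lines: DataStream[String] = env.fromElements("a b", "c d e")
    lines
      .flatMap(_.split(" ")) // split each line into words
      .filter(_.nonEmpty)    // drop empty tokens
      .map(_.toUpperCase)    // transform each word
      .print()
    env.execute()
  }
}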
1.1 The keyBy operator
package com.sjxy.flink.stream.source.transformation

import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

/*
Demonstrates keyBy in Flink by implementing a word count.
*/
object KeyByDemo {
  def main(args: Array[String]): Unit = {
    // 1 Create a stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2 Load a socket stream as the source
    val socketDs: DataStream[String] = env.socketTextStream("node1", 9999)
    // 3 Split and flatten the incoming lines into (word, 1) tuples
    val wordAndOneDs: DataStream[(String, Int)] = socketDs.flatMap(_.split(" ")).map(_ -> 1)
    // 4 Group by the word; the key-selector form is preferred because
    //   the index-based keyBy(0) is deprecated in newer Flink versions
    wordAndOneDs.keyBy(_._1).sum(1).print()
    // 5 Start the job
    env.execute()
  }
}
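To drive the demo, first open a socket on node1 (for example with nc -lk 9999) and type space-separated words. The job keeps a running count per key, so entering flink twice prints (flink,1) followed by (flink,2).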
1.2 The connect operator
package com.sjxy.flink.stream.source.transformation

import java.util.concurrent.TimeUnit
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala.{ConnectedStreams, DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._

/*
Demonstrates connect in Flink, which joins two data streams together.
Requirement:
  Create two streams, one producing numbers and one producing strings,
  connect them with connect, and observe the result.
*/
object ConnectDemo {
  def main(args: Array[String]): Unit = {
    // 1 Create a stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2 Load the sources
    val numDs: DataStream[Long] = env.addSource(new MyNumberSource)
    val strDs: DataStream[String] = env.addSource(new MyStrSource)
    // 3 Connect the two streams
    val connectedDs: ConnectedStreams[Long, String] = numDs.connect(strDs)
    // Pass two functions, one per input, to process each stream separately
    val resDs: DataStream[String] = connectedDs.map(l => "long" + l, s => "string" + s)
    // What is connect actually for? It only merges two streams into one while each
    // keeps its own processing logic; the real benefit is that after connect the two
    // streams can share state (see the sketch after the two source classes below)
    // 4 Print the result
    resDs.print()
    // 5 Start the job
    env.execute()
  }
}
// Custom source #1: emits an incrementing number every second
class MyNumberSource extends SourceFunction[Long] {
  var flag = true
  var num = 0L
  override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
    while (flag) {
      num += 1
      ctx.collect(num)
      TimeUnit.SECONDS.sleep(1)
    }
  }
  override def cancel(): Unit = {
    flag = false
  }
}

// Custom source #2: emits strings with a counter that starts at 1
class MyStrSource extends SourceFunction[String] {
  var flag = true
  var num = 0L
  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (flag) {
      num += 1
      ctx.collect("str" + num)
      TimeUnit.SECONDS.sleep(1)
    }
  }
  override def cancel(): Unit = {
    flag = false
  }
}
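The shared-state point deserves a concrete example. The sketch below (hypothetical class and stream names, not part of the demo above) keys both inputs of a connected stream by the same field and uses a RichCoFlatMapFunction whose two callbacks read and write the same keyed ValueState: input 1 acts as a control stream that toggles a flag, and input 2 is forwarded only while the flag is set.

import org.apache.flink.api.common.state.{ValueState, ValueStateDescriptor}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.co.RichCoFlatMapFunction
import org.apache.flink.util.Collector

// Hypothetical: control messages (key, on/off) gate data messages (key, value)
class SwitchFunction extends RichCoFlatMapFunction[(String, Boolean), (String, Int), (String, Int)] {
  private var enabled: ValueState[Boolean] = _

  override def open(parameters: Configuration): Unit = {
    enabled = getRuntimeContext.getState(
      new ValueStateDescriptor[Boolean]("enabled", classOf[Boolean]))
  }

  // Input 1 (control stream): update the shared per-key state
  override def flatMap1(control: (String, Boolean), out: Collector[(String, Int)]): Unit =
    enabled.update(control._2)

  // Input 2 (data stream): read the state written by input 1
  override def flatMap2(data: (String, Int), out: Collector[(String, Int)]): Unit =
    if (enabled.value()) out.collect(data)
}

Wiring it up looks like controlDs.connect(dataDs).keyBy(_._1, _._1).flatMap(new SwitchFunction).print(); keyed state requires keying both inputs first, which is why ConnectedStreams offers a two-selector keyBy.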
2 Sink
2.1 Sink to MySQL
1. Before running the code, inspect the target MySQL table with a tool such as DataGrip and confirm that the row to be inserted is not already there;
2. Run the code;
3. Check that the data has been inserted.
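Note that the demo assumes the ke database already contains a t_student table whose id column is generated by the database, since the insert statement below supplies only name and age; adjust the SQL if your schema differs.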
package com.sjxy.flink.stream.source.sink

import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

/*
Saves the result of a Flink computation to MySQL.
*/
// Define the student case class
case class Student1(id: Int, name: String, age: Int)

object SinkToMysqlDemo {
  def main(args: Array[String]): Unit = {
    /*
    Read data and write it straight to MySQL. This requires a custom MySQL
    SinkFunction: a class extending RichSinkFunction that overrides the
    open, invoke, and close methods.
    */
    // 1 Create a stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2 Load the source
    val stuDs: DataStream[Student1] = env.fromElements(Student1(0, "tony0422", 18))
    // 3 Write directly to MySQL
    stuDs.addSink(new MySqlSinkFunction)
    // 4 Execute
    env.execute()
  }
}

// The custom MySQL SinkFunction
class MySqlSinkFunction extends RichSinkFunction[Student1] {
  var ps: PreparedStatement = null
  var connection: Connection = null

  // 3.1 Open the connection (called once per parallel sink instance)
  override def open(parameters: Configuration): Unit = {
    // 3.1.1 Connect via DriverManager
    connection = DriverManager.getConnection("jdbc:mysql://node1:3306/ke", "root", "123456")
    // 3.1.2 Prepare the SQL that inserts data into the MySQL table
    val sql = "insert into t_student(name,age) values(?,?)"
    // 3.1.3 Prepare the statement object
    ps = connection.prepareStatement(sql)
  }

  // Close the statement first, then the connection
  override def close(): Unit = {
    if (ps != null) ps.close()
    if (connection != null) connection.close()
  }

  // 3.2 Writes each record to MySQL; value is the element handed down by the upstream DataStream
  override def invoke(value: Student1, context: SinkFunction.Context): Unit = {
    // 3.2.1 Bind the parameters
    ps.setString(1, value.name)
    ps.setInt(2, value.age)
    // 3.2.2 Execute the insert
    ps.executeUpdate()
  }
}
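Hand-rolling the sink is instructive, but if the flink-connector-jdbc dependency (plus the MySQL driver) is on the classpath, the bundled JdbcSink covers the same ground. A minimal sketch against the same t_student table, usable as a drop-in replacement for step 3 in main:

import java.sql.PreparedStatement
import org.apache.flink.connector.jdbc.{JdbcConnectionOptions, JdbcSink, JdbcStatementBuilder}

// Equivalent sink built from the provided JDBC connector instead of a custom RichSinkFunction
val jdbcSink = JdbcSink.sink[Student1](
  "insert into t_student(name,age) values(?,?)",
  new JdbcStatementBuilder[Student1] {
    override def accept(ps: PreparedStatement, s: Student1): Unit = {
      ps.setString(1, s.name)
      ps.setInt(2, s.age)
    }
  },
  new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
    .withUrl("jdbc:mysql://node1:3306/ke")
    .withUsername("root")
    .withPassword("123456")
    .build()
)
// stuDs.addSink(jdbcSink)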
2.2 Sink to Kafka
1. Start Kafka with the startup script first;
2. Open an offset monitoring tool to watch the topic;
3. Run the code.
package com.sjxy.flink.stream.source.sink

import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.flink.streaming.connectors.kafka.internals.KeyedSerializationSchemaWrapper
// Alternative import location in older Flink versions:
//import org.apache.flink.streaming.util.serialization.KeyedSerializationSchemaWrapper
import org.apache.kafka.clients.producer.ProducerConfig

/*
Saves the result of a Flink computation to Kafka.
*/
// Define the student case class
case class Student(id: Int, name: String, age: Int)

object SinkToKafkaDemo {
  def main(args: Array[String]): Unit = {
    /*
    Flink reads data and then writes it into Kafka.
    */
    // 1 Create a stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    // 2 Load the source
    val stuDs: DataStream[Student] = env.fromElements(Student(0, "tony", 18))
    // 3 Produce the data to Kafka with a FlinkKafkaProducer
    // 3.1 Build the FlinkKafkaProducer; it writes String records to Kafka
    // param1: the target topic
    val topic = "test"
    // param2: the serialization schema
    val keyedSerializationWrapper: KeyedSerializationSchemaWrapper[String] =
      new KeyedSerializationSchemaWrapper(new SimpleStringSchema())
    // param3: the producer properties
    val prop = new Properties()
    prop.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, "node1:9092,node2:9092")
    val flinkKafkaProducer: FlinkKafkaProducer[String] = new FlinkKafkaProducer[String](
      topic, keyedSerializationWrapper, prop)
    // 4 Sink
    stuDs.map(_.toString).addSink(flinkKafkaProducer)
    // 5 Execute
    env.execute()
  }
}
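Besides an offset tool, the write can be verified by reading the topic back with FlinkKafkaConsumer from the same connector. A minimal sketch, assuming the same "test" topic and brokers (the group.id value is arbitrary):

import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

object SinkToKafkaVerify {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "node1:9092,node2:9092")
    prop.setProperty("group.id", "sink-verify") // arbitrary consumer group
    val consumer = new FlinkKafkaConsumer[String]("test", new SimpleStringSchema(), prop)
    consumer.setStartFromEarliest() // read from the beginning so earlier test records show up
    // Print the topic contents as plain strings
    env.addSource(consumer).print()
    env.execute()
  }
}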
Note: the one-click Kafka startup script is problematic; refer instead to the alternative steps in section 2.1 of "Kafka简介与基本使用" (Appendix Ⅰ).