The previous articles covered technical points of Flink batch processing; today we look at the DataSource and DataSink side of stream processing ☛ (Flink series)
1. Getting Started
Compute a word count with Flink's stream-processing API.
Steps:
- Get the Flink stream execution environment
- Build a socket source
- Count the words with Flink operators
- Print the result
Note: if nc is not installed on your Linux machine, install it with yum:
yum install -y nc
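Before starting the job, open a listener on node01 (the host and port assumed in the code below) so the socket source has something to connect to:
nc -lk 9999
Every line typed into this session is pushed to the running job.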
Reference code:
import org.apache.flink.streaming.api.scala._
/**
 * Streaming word count.
 * @date 2020/8/26 22:03
 * @version 1.0
 */
object StreamWordCount {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Receive data over a socket
    val socketData: DataStream[String] = env.socketTextStream("node01", 9999)
    //3. Split each line into words, attach a count of 1, group with keyBy and aggregate with sum
    val result = socketData.flatMap(_.split(" ")).map((_, 1)).keyBy(0).sum(1)
    //4. Print the result and start the job
    result.print("StreamWordCount")
    env.execute("StreamWordCount")
  }
}
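With the job running, typing hello flink hello into the nc session produces incrementally updated counts, along these lines (prefix and ordering are illustrative):
StreamWordCount> (hello,1)
StreamWordCount> (flink,1)
StreamWordCount> (hello,2)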
2. Common streaming Sources in Flink
Note: the sources Flink offers in stream processing are largely the same as the ones in batch processing.
2.1 Collection-based Source
I won't walk through every variant one by one here; this is just to get you started. For more on local data sources, see ☛ DataSource.
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/20 18:57
 * @version 1.0
 */
object StreamDataSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a dataset with fromCollection
    val data = env.fromCollection(List("张三", "李四", "王五"))
    //3. Print
    data.print()
    env.execute("StreamDataSource")
  }
}
2.2 File-based Source
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/20 19:05
 * @version 1.0
 */
object StreamFileSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a dataset from a file
    val dataSource = env.readTextFile("./data/wordcount.txt")
    //3. Print
    dataSource.print()
    env.execute("StreamFileSource")
  }
}
2.3 Custom Sources
Besides the predefined sources, we can define our own by implementing SourceFunction and registering it with StreamExecutionEnvironment.addSource(sourceFunction). The Kafka source is added the same way, e.g. addSource(new FlinkKafkaConsumer08<>(...)). We can implement any of the following three interfaces to build a custom source:
2.3.1 SourceFunction: a non-parallel data source
import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/20 19:22
 * @version 1.0
 */
object StreamCustomerNoParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Add the custom source (a plain SourceFunction may only run with parallelism 1)
    val dataSource = env.addSource(new MyNoParallel()).setParallelism(1)
    //3. Print
    dataSource.print()
    //4. Run the job
    env.execute("StreamCustomerNoParallelSource")
  }

  class MyNoParallel() extends SourceFunction[Long] {
    // current value to emit
    var number: Long = 1L
    // flag checked by the emit loop; cancel() flips it
    var isRunning: Boolean = true

    override def run(sourceContext: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        sourceContext.collect(number)
        number += 1
        Thread.sleep(1000)
        if (number == 10) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
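For sources that take part in checkpointing, records should be emitted while holding the checkpoint lock, so that collect() and the state update stay atomic with respect to a checkpoint. A minimal sketch of this pattern (my addition, assuming the Flink 1.x SourceFunction API; the class name is hypothetical):
import org.apache.flink.streaming.api.functions.source.SourceFunction

class MyLockedSource extends SourceFunction[Long] {
  @volatile private var isRunning = true
  private var number = 1L

  override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
    while (isRunning && number < 10) {
      // hold the checkpoint lock so collect() and the counter update are atomic w.r.t. checkpoints
      ctx.getCheckpointLock.synchronized {
        ctx.collect(number)
        number += 1
      }
      Thread.sleep(1000)
    }
  }

  override def cancel(): Unit = isRunning = false
}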
2.3.2 ParallelSourceFunction: a parallel data source
import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/20 20:40
 * @version 1.0
 */
object StreamCustomerParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Create a stream from the custom ParallelSourceFunction (parallelism may be set > 1)
    val source = env.addSource(new MyParallelSource()).setParallelism(1)
    //3. Print
    source.print()
    //4. Run the job
    env.execute("StreamCustomerParallelSource")
  }

  class MyParallelSource() extends ParallelSourceFunction[Long] {
    //1. Current value to emit
    var number: Long = 1L
    //2. Flag checked by the emit loop; cancel() flips it
    var isRunning: Boolean = true

    override def run(sourceContext: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        sourceContext.collect(number)
        number += 1
        Thread.sleep(1000) // pace the emission; without this the loop spins as fast as possible
        if (number > 20) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
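Note that with a ParallelSourceFunction every parallel subtask runs its own copy of run(). If you raise the parallelism above 1 (for example .setParallelism(2)), each number from 1 to 20 therefore appears once per subtask in the output.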
2.3.3 RichParallelSourceFunction: a parallel data source with lifecycle hooks (open/close) and access to the runtime context
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/20 20:48
 * @version 1.0
 */
object StreamCustomerRichParallelSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Create a parallel stream from the custom RichParallelSourceFunction
    val dataSource = env.addSource(new RichParallelSource()).setParallelism(2)
    dataSource.map(line => {
      // print each element as a side effect, then pass it through unchanged
      println("received: " + line)
      line
    })
    env.execute("StreamCustomerRichParallelSource")
  }

  class RichParallelSource() extends RichParallelSourceFunction[Long] {
    //1. Current value to emit
    var number: Long = 1L
    //2. Flag checked by the emit loop; cancel() flips it
    var isRunning: Boolean = true

    override def run(sourceContext: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        sourceContext.collect(number)
        number += 1
        Thread.sleep(1000)
        if (number > 20) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }
}
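The Rich variant's open()/close() lifecycle is the natural place to set up and tear down a per-subtask resource (a connection, a client, etc.). A minimal sketch of that idea (my addition; the class name and printouts are hypothetical):
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}

class TickSource extends RichParallelSourceFunction[String] {
  @volatile private var isRunning = true

  override def open(parameters: Configuration): Unit = {
    // runs once per parallel subtask before run(); open connections here
    println(s"subtask ${getRuntimeContext.getIndexOfThisSubtask} opened")
  }

  override def run(ctx: SourceFunction.SourceContext[String]): Unit = {
    while (isRunning) {
      ctx.collect("tick")
      Thread.sleep(1000)
    }
  }

  override def cancel(): Unit = isRunning = false

  override def close(): Unit = {
    // runs when the task shuts down; release resources here
    println("subtask closed")
  }
}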
2.3.4 Kafka Source
I won't go into the common Kafka commands here; if you want to learn them, see -> Common Kafka Operations.
Example code:
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011
/**
 * Consume data from Kafka.
 * @date 2020/9/21 22:53
 * @version 1.0
 */
object StreamKafkaSource {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. The topic to consume
    val topic = "FlinkAsKafka"
    //2.1 Consumer configuration
    val props = new Properties()
    props.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    props.setProperty("group.id", "test01")
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    //3. Build the Flink Kafka consumer
    val kafka = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), props)
    //4. Start consuming from the latest offset
    kafka.setStartFromLatest()
    //5. Build the stream from the Kafka source
    val data = env.addSource(kafka)
    //6. Print
    data.print()
    env.execute("StreamKafkaSource")
  }
}
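To test the consumer you can push a few messages into the topic with the console producer shipped with Kafka (flags as in Kafka 0.11/1.x; adjust to your version):
kafka-console-producer.sh --broker-list node01:9092 --topic FlinkAsKafka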
2.3.5 MySQL Source
The above is Flink's built-in Kafka source; next we follow the same pattern and write a source that reads data from MySQL.
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._
/**
 * Custom MySQL source.
 * @date 2020/9/21 23:17
 * @version 1.0
 */
object StreamFromMysqlSource {
  case class User(id: String, user_id: String, user_name: String, phone: String, lan_id: String, region_id: String)

  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Add the custom MySQL source
    val source = env.addSource(new MySqlSource())
    //3. Print
    source.print()
    //4. Run the job
    env.execute("StreamFromMysqlSource")
  }

  class MySqlSource() extends RichSourceFunction[User] {
    //1. JDBC connection, created once per subtask in open()
    var connection: Connection = null
    //2. Prepared statement for the query
    var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://node01:3306/datax_web"
      val username = "root"
      val password = "123456"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      val sql =
        """
          |SELECT id,user_id,user_name,phone,lan_id,region_id
          |FROM user
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }

    override def run(sourceContext: SourceFunction.SourceContext[User]): Unit = {
      val queryResultSet = ps.executeQuery()
      while (queryResultSet.next()) {
        val id = queryResultSet.getString("id")
        val user_id = queryResultSet.getString("user_id")
        val user_name = queryResultSet.getString("user_name")
        val phone = queryResultSet.getString("phone")
        val lan_id = queryResultSet.getString("lan_id")
        val region_id = queryResultSet.getString("region_id")
        sourceContext.collect(User(id, user_id, user_name, phone, lan_id, region_id))
      }
    }

    override def cancel(): Unit = {}

    override def close(): Unit = {
      // release JDBC resources in close(), statement first, then the connection
      if (ps != null) ps.close()
      if (connection != null) connection.close()
    }
  }
}
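Since the query runs exactly once in run(), this source is bounded: it emits every row of the user table once and the job then finishes. For continuous ingestion you would loop inside run(), for example re-running the query after a Thread.sleep, guarded by the isRunning/cancel pattern shown in the earlier sources.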
3. Common DataSinks in Flink
3.1 Sinking data to a local file
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/22 22:41
 * @version 1.0
 */
object StreamFileSourceSinkFile {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a dataset from a local file
    val data = env.readTextFile("./data/wordcount.txt")
    //3. Write the result to a file
    data.writeAsText("./data/wordcountSink.txt").setParallelism(1)
    //4. Run the job
    env.execute("StreamFileSourceSinkFile")
  }
}
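By default writeAsText fails if the target already exists, so re-runs of this demo will error out. Assuming Flink's FileSystem.WriteMode enum, you can opt into overwriting:
import org.apache.flink.core.fs.FileSystem
data.writeAsText("./data/wordcountSink.txt", FileSystem.WriteMode.OVERWRITE).setParallelism(1)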
3.2 Sinking a collection-based stream to a local file
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/22 22:50
 * @version 1.0
 */
object StreamFromCollectionSourceFile {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a dataset with fromCollection
    val data = env.fromCollection(List((1, "张三"), (2, "李四"), (1, "赵刘")))
    //3. Write the dataset to a file
    data.writeAsText("./data/fromCollection.txt").setParallelism(1)
    //4. Run the job
    env.execute("StreamFromCollectionSourceFile")
  }
}
3.3 Sinking data to HDFS
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/22 22:41
 * @version 1.0
 */
object StreamFileSourceSinkFileHDFS {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a dataset from a local file
    val data = env.readTextFile("./data/wordcount.txt")
    //3. Write the result to HDFS
    data.writeAsText("hdfs://node01:8020/data/wordcountSink.txt").setParallelism(1)
    //4. Run the job
    env.execute("StreamFileSourceSinkFileHDFS")
  }
}
3.4 Sinking data to Kafka
import java.util.Properties
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011
/**
 * @date 2020/9/22 23:17
 * @version 1.0
 */
object StreamKafkaSink {
  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the dataset
    val dataSource: DataStream[String] = env.fromElements("1,小丽,北京,女")
    //3. Producer configuration
    val prop = new Properties()
    prop.setProperty("bootstrap.servers", "node01:9092,node02:9092,node03:9092")
    //4. Build the Flink Kafka producer
    val producer: FlinkKafkaProducer011[String] = new FlinkKafkaProducer011[String]("FlinkAsKafka", new SimpleStringSchema(), prop)
    //5. Write the data to Kafka
    dataSource.addSink(producer)
    //6. Run the job
    env.execute("StreamKafkaSink")
  }
}
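You can verify the write with the console consumer that ships with Kafka (flags as in recent Kafka versions; adjust to yours):
kafka-console-consumer.sh --bootstrap-server node01:9092 --topic FlinkAsKafka --from-beginning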
3.5 Sinking data to MySQL
import java.sql.{Connection, DriverManager, PreparedStatement}
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._
/**
 * @date 2020/9/22 23:35
 * @version 1.0
 */
object StreamMysqlSink {
  case class Student(id: Int, name: String, addr: String, sex: String)

  def main(args: Array[String]): Unit = {
    //1. Build the stream execution environment
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Prepare the data
    val dataSource: DataStream[Student] = env.fromElements(
      Student(1, "张三", "上海", "男"),
      Student(2, "李四", "北京", "女"),
      Student(3, "王五", "上海", "男"),
      Student(4, "赵刘", "广东", "男")
    )
    dataSource.addSink(new StudentSinkToMysql)
    env.execute("StreamMysqlSink")
  }

  class StudentSinkToMysql extends RichSinkFunction[Student] {
    var connection: Connection = null
    var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://node01:3306/text?characterEncoding=utf-8&useSSL=false"
      val username = "root"
      val password = "123456"
      // Load the driver
      Class.forName(driver)
      // Create the connection
      connection = DriverManager.getConnection(url, username, password)
      ps = connection.prepareStatement("insert into student(id,name,addr,sex) values (?,?,?,?)")
    }

    override def close(): Unit = {
      // release JDBC resources: statement first, then the connection
      if (ps != null) ps.close()
      if (connection != null) connection.close()
    }

    override def invoke(value: Student): Unit = {
      ps.setInt(1, value.id)
      ps.setString(2, value.name)
      ps.setString(3, value.addr)
      ps.setString(4, value.sex)
      ps.executeUpdate()
    }
  }
}
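This assumes the target table already exists; a hypothetical schema matching the insert statement would be: create table student(id int, name varchar(32), addr varchar(64), sex varchar(8)). Also note that invoke() issues one executeUpdate per element, which is fine for a demo but would typically be batched for real workloads.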