[Flink] Stream Processing: DataStream API Development

I. Word Count Using a Socket

Prerequisite: install the nc (netcat) service

yum install -y nc

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment, WindowedStream}
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow

object StreamWordCount {
  def main(args: Array[String]): Unit = {
    //1. Get the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    env.setParallelism(1)
    import org.apache.flink.api.scala._
    //2. Build a socket source, specifying the host and port
    val testDataStream: DataStream[String] = env.socketTextStream("node01",7777)
    //3. Convert the received lines into (word, 1) tuples
    val wordDataStream: DataStream[(String, Int)] = testDataStream.flatMap(_.split(" ")).map(_ -> 1)
    //4. Group (partition) the stream with keyBy
    //   For a batch DataSet, grouping uses groupBy
    //   For a streaming DataStream, grouping (partitioning) uses keyBy
    val groupedDataStream: KeyedStream[(String, Int), Tuple] = wordDataStream.keyBy(0)
    //5. Use timeWindow to set the window length (compute every 5 seconds)
    //   (comparable to reduceByKeyAndWindow in Spark Streaming)
    val windowDataStream: WindowedStream[(String, Int), Tuple, TimeWindow] = groupedDataStream.timeWindow(Time.seconds(5))
    //6. Use sum to accumulate the counts
    val sumDataStream: DataStream[(String, Int)] = windowDataStream.sum(1)
    sumDataStream.print()
    env.execute()
  }
}

Start the nc service and type some input data.
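For example (assuming nc runs on node01, the host the program connects to), start nc listening on the port used in the code above:

nc -lk 7777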

Run the program; it prints the word counts once per 5-second window.
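As an illustration of the expected behaviour: if the words typed into nc during one 5-second window are

hadoop spark spark

the program prints the per-window counts, roughly (ordering may vary):

(hadoop,1)
(spark,2)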

II. Data Sources

In Flink you add a data source to your program with StreamExecutionEnvironment.addSource(source). Flink already ships with a number of ready-made source functions, but you can also implement SourceFunction to define a custom non-parallel source, or implement the ParallelSourceFunction interface or extend RichParallelSourceFunction to define a custom parallel source.

1. Common streaming Sources in Flink

The sources Flink provides for stream processing are basically the same as those for batch processing. There are roughly four categories:
Collection-based source: built from a local collection.
File-based source: reads text files, i.e. files conforming to TextInputFormat, and returns their contents as strings.
Socket-based source: reads from a socket; elements can be split by a delimiter.
Custom source: user-defined.

2. Collection-based source

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import scala.collection.immutable.{Queue, Stack}
import scala.collection.mutable
import scala.collection.mutable.{ArrayBuffer, ListBuffer}
import org.apache.flink.api.scala._
object StreamDataSourceDemo {
  def main(args: Array[String]): Unit = {
    val senv = StreamExecutionEnvironment.getExecutionEnvironment
    //0. Create a DataStream from elements (fromElements)
    val ds0: DataStream[String] = senv.fromElements("spark","flink")
    ds0.print()
    //1. Create a DataStream from tuples (fromElements)
    val ds1: DataStream[(Int, String)] = senv.fromElements((1,"spark"),(2,"flink"))
    ds1.print()
    //2. Create a DataStream from an Array
    val ds2: DataStream[String] = senv.fromCollection(Array("spark","flink"))
    ds2.print()
    //3. Create a DataStream from an ArrayBuffer
    val ds3: DataStream[String] = senv.fromCollection(ArrayBuffer("spark","flink"))
    ds3.print()
    //4. Create a DataStream from a List
    val ds4: DataStream[String] = senv.fromCollection(List("spark", "flink"))
    ds4.print()
    //5. Create a DataStream from a ListBuffer
    val ds5: DataStream[String] = senv.fromCollection(ListBuffer("spark", "flink"))
    ds5.print()
    //6. Create a DataStream from a Vector
    val ds6: DataStream[String] = senv.fromCollection(Vector("spark", "flink"))
    ds6.print()
    //7. Create a DataStream from a Queue
    val ds7: DataStream[String] = senv.fromCollection(Queue("spark", "flink"))
    ds7.print()
    //8. Create a DataStream from a Stack
    val ds8: DataStream[String] = senv.fromCollection(Stack("spark", "flink"))
    ds8.print()
    //9. Create a DataStream from a Stream (a Stream is a lazy List, avoiding unnecessary intermediate collections)
    val ds9: DataStream[String] = senv.fromCollection(Stream("spark", "flink"))
    ds9.print()
    //10. Create a DataStream from a Seq
    val ds10: DataStream[String] = senv.fromCollection(Seq("spark", "flink"))
    ds10.print()
    //11. Set is not supported
    //val ds11: DataStream[String] = senv.fromCollection(Set("spark", "flink"))
    //ds11.print()
    //Trigger execution (required, otherwise the streaming job never runs)
    senv.execute("StreamDataSourceDemo")
  }
}

3. File-based source

import org.apache.flink.streaming.api.datastream.DataStreamSource
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment

object StreamFileSourceDemo {
  def main(args: Array[String]): Unit = {
    //1. Create the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data set from a file-based source
    val textDStream: DataStreamSource[String] = env.readTextFile("E:\\资料\\第二学年第二学期\\flink\\day02资料\\测试数据源\\wordcount.txt")
    //3. Print the result
    textDStream.print()
    //4. Execute the program
    env.execute("StreamFileSourceDemo")
  }
}

4. Socket-based source

val source = env.socketTextStream("IP", PORT)

5. Custom source

(1) SourceFunction: creates a non-parallel data source.

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._

object StreamCustomerNoParallelSourceDemo {
  def main(args: Array[String]): Unit = {
    //1. Create the execution environment
    val senv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data source
    val NoParllelDataStream: DataStream[Long] = senv.addSource(new NoParallelSource()).setParallelism(1)
    //3. Print the result
    NoParllelDataStream.print()
    //4. Execute the program
    senv.execute("StreamCustomerNoParallelSourceDemo")
  }

  //A data source with parallelism 1
  //that emits increasing numbers starting from 1
  class NoParallelSource extends SourceFunction[Long]() {
    //A Long counter
    var number: Long = 1L
    //A Boolean flag, initialized to true, that keeps the source running
    var isrunning: Boolean = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isrunning) {
        ctx.collect(number)
        Thread.sleep(1000)
        number += 1
        if (number > 5) {
          isrunning = false
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isrunning = false
    }
  }

}

(2) ParallelSourceFunction: creates a parallel data source.

import org.apache.flink.streaming.api.functions.source.{ParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._

object StreamCustomerParallelSourceDemo {
  def main(args: Array[String]): Unit = {
    //1. Create the streaming execution environment
    val senv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build parallel data from the custom ParallelSource (parallelism 2)
    val parallelSource: DataStream[Long] = senv.addSource(new MyParallelSource()).setParallelism(2)
    //3. Print the result
    parallelSource.print()
    //4. Execute the program
    senv.execute("StreamCustomerParallelSourceDemo")
  }

  /**
   * A parallel data source:
   * each parallel instance emits increasing numbers starting from 1
   */
  class MyParallelSource extends ParallelSourceFunction[Long]() {
    //A Long counter
    var number: Long = 1L
    //A Boolean flag, initialized to true, that keeps the source running
    var isrunning: Boolean = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isrunning) {
        ctx.collect(number)
        number += 1
        if (number > 20) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isrunning = false
    }
  }

}

(3) RichParallelSourceFunction: creates a parallel data source with access to the rich function lifecycle (open/close).

import org.apache.flink.streaming.api.functions.source.{RichParallelSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala._

object StreamCustomerRichParallelSourceDemo {
  def main(args: Array[String]): Unit = {
    //1. Create the streaming execution environment
    val senv: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build the data set from the custom RichParallelSource (parallelism 2)
    val richParallelSource: DataStream[Long] = senv.addSource(new RichParallelSource()).setParallelism(2)
    //3. Print the received data inside a map
    richParallelSource.map(line => {
      print("received: " + line)
      line
    })
    //4. Execute the program
    senv.execute("StreamCustomerRichParallelSourceDemo")
  }

  class RichParallelSource extends RichParallelSourceFunction[Long] {
    var number: Long = 1L
    var isRunning: Boolean = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(number)
        number += 1
        Thread.sleep(1)
        if (number > 5) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }

}

6. Kafka source

(Basic Kafka operations are covered in a separate post.)
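Reading from Kafka requires the Kafka connector on the classpath in addition to the core Flink dependencies. A sketch of the coordinate, assuming an sbt build and using 1.7.2 purely as an example version matching the FlinkKafkaConsumer011 API used below:

// build.sbt (example coordinates; align the version with your Flink distribution)
libraryDependencies += "org.apache.flink" %% "flink-connector-kafka-0.11" % "1.7.2"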

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer011

object StreamKafkaSourceDemo {
  def main(args: Array[String]): Unit = {
    //Create the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //Specify the topic to consume from
    var topic = "test";
    //Set the consumer properties
    val props = new Properties
    props.setProperty("bootstrap.servers", "node01:9092")
    props.setProperty("group.id", "test01")
    props.setProperty("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    props.setProperty("value.deserializer", "org.apache.kafka.common.serialization.StringDeserializer")
    //Create a Flink Kafka consumer
    val kafkaConsumer: FlinkKafkaConsumer011[String] = new FlinkKafkaConsumer011[String](topic, new SimpleStringSchema(), props)
    //Start consuming from the latest offsets in the topic
    kafkaConsumer.setStartFromLatest();
    //Build the Kafka-based data source
    val kafkaDataStream: DataStream[String] = env.addSource(kafkaConsumer)
    //Print the consumed data
    kafkaDataStream.print()
    //Execute the streaming program
    env.execute("StreamKafkaSourceDemo")
  }
}

7. MySQL source

import java.sql.{Connection, DriverManager, PreparedStatement, ResultSet}

import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.source.{RichSourceFunction, SourceFunction}
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamFromMysqlSource {
  def main(args: Array[String]): Unit = {
    //1. Create the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Add the custom MySQL source
    val studentDataStream: DataStream[Student] = env.addSource(new MysqlSource())
    studentDataStream.print()
    env.execute("StreamFromMysqlSource")
  }
  //3. Custom MySQL source
  class MysqlSource extends RichSourceFunction[Student](){
    //3.1 Declare the JDBC Connection
    var connection: Connection= null;
    //3.2 Declare the PreparedStatement
    var ps: PreparedStatement = null;

    //Configure the connection (driver, url, username, password) in open():
    //load the driver with Class.forName(), get a connection via DriverManager, and prepare the SQL statement
    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test"
      val username = "root"
      val password = "root"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, username, password)
      var sql =
        """
          |select id,name,addr,sex
          |from student
          |""".stripMargin
      ps = connection.prepareStatement(sql)
    }
    //In run(), execute the query, wrap each row in the Student case class, and emit it with ctx.collect()
    override def run(ctx: SourceFunction.SourceContext[Student]): Unit = {
      val queryResultSet: ResultSet = ps.executeQuery()
      while (queryResultSet.next()){
        val id: Int = queryResultSet.getInt("id")
        val name: String = queryResultSet.getString("name")
        val addr: String = queryResultSet.getString("addr")
        val sex: String = queryResultSet.getString("sex")
        val student: Student = Student(id, name, addr, sex)
        ctx.collect(student)
      }
    }

    //Release resources: close the statement before the connection
    override def close(): Unit = {
      if (ps!=null){
        ps.close()
      }
      if (connection!=null){
        connection.close()
      }
    }
    override def cancel(): Unit = {
    }
  }
  case class Student(id: Int, name: String, addr: String, sex: String){
    override def toString: String = {
      "id:"+id+" name:"+name+" addr:"+addr+" sex:"+sex
    }
  }
}

III. DataStream Transformations

1. KeyBy

Logically partitions a stream into disjoint partitions, each containing the elements of the same key. Internally this is implemented with hash partitioning.

import org.apache.flink.api.java.tuple.Tuple
import org.apache.flink.streaming.api.scala.{DataStream, KeyedStream, StreamExecutionEnvironment}
import org.apache.flink.api.scala._

object StreamKeyBy {
  def main(args: Array[String]): Unit = {
    //Get the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //Get the data source
    val socketStream: DataStream[String] = env.socketTextStream("node01",9999);
    //Group with keyBy
    val groupStream: KeyedStream[(String, Int), Tuple] = socketStream.flatMap(_.split(" ")).map((_, 1)).keyBy(0)
    //Aggregate
    val result: DataStream[(String, Int)] = groupStream.sum(1)
    //Print the result
    result.print()
    //Execute the program
    env.execute("StreamKeyBy")
  }
}

2. Connect

Connect combines two DataStreams into a single ConnectedStreams. The ConnectedStreams keeps the original structure of each underlying DataStream, so data of different types can be handled together in one place.

import org.apache.flink.streaming.api.functions.source.SourceFunction
import org.apache.flink.streaming.api.scala._

object StreamConnectDemo {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val source1: DataStream[Long] = env.addSource(new MyNoParallelSource)
    val source2: DataStream[Long] = env.addSource(new MyNoParallelSource)
    val connectStreams: ConnectedStreams[Long, Long] = source1.connect(source2)
    val result: DataStream[String] = connectStreams.map(
      function1 => {
        "function1 = " + function1
      },
      function2 => {
        "function2 = " + function2
      }
    )
    result.print()
    env.execute("StreamConnectDemo")
  }

  /**
   * A custom source with parallelism 1
   * that emits increasing numbers starting from 1
   */
  class MyNoParallelSource extends SourceFunction[Long] {
    var count = 1L
    var isRunning = true

    override def run(ctx: SourceFunction.SourceContext[Long]): Unit = {
      while (isRunning) {
        ctx.collect(count)
        count += 1
        Thread.sleep(1000)
        if (count > 5) {
          cancel()
        }
      }
    }

    override def cancel(): Unit = {
      isRunning = false
    }
  }

}

3. Split and Select

Split divides one DataStream into two or more DataStreams; Select retrieves the corresponding DataStream after the split.

import org.apache.flink.streaming.api.scala._

/**
 * Given the data 1, 2, 3, 4, 5, 6, 7,
 * use split and select to separate the odd and even numbers, then print the odd numbers
 */
object StreamSplit {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val source: DataStream[Int] = env.fromElements(1, 2, 3, 4, 5, 6, 7)
    val splitStream: SplitStream[Int] = source.split(x => {
      (x % 2) match {
        case 0 => List("even")
        case 1 => List("odd")
      }
    })
    val selectDataStream: DataStream[Int] = splitStream.select("odd")
    selectDataStream.print()
    env.execute("StreamSplit")
  }

}

IV. Data Sinks

Sink data to a local file (same approach as in batch processing)
Sink to a local collection (same approach as in batch processing)
Sink to HDFS (same approach as in batch processing)
(See the batch processing post for details; a small sketch of the file sinks follows below.)
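A minimal sketch of those built-in sinks on a DataStream (the object name, output paths and NameNode address below are made-up placeholders):

import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamFileSinkDemo {
  def main(args: Array[String]): Unit = {
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    val result: DataStream[String] = env.fromElements("spark", "flink", "hadoop")
    //Sink to a local file (placeholder path)
    result.writeAsText("file:///tmp/flink-output")
    //Sink to HDFS (placeholder NameNode address and path)
    result.writeAsText("hdfs://node01:8020/flink/output")
    //Sink to the console (standard output)
    result.print()
    env.execute("StreamFileSinkDemo")
  }
}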

1. Sink to Kafka

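Writing a DataStream out to Kafka uses FlinkKafkaProducer011 as a sink. A minimal sketch, assuming the same broker (node01:9092) and topic (test) as in the Kafka source example:

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.scala._
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer011

object StreamKafkaSinkDemo {
  def main(args: Array[String]): Unit = {
    //Create the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //Build a small data set to write out
    val dataStream: DataStream[String] = env.fromElements("spark", "flink", "hadoop")
    //Create the Kafka producer: broker list, target topic, serialization schema
    val kafkaProducer = new FlinkKafkaProducer011[String]("node01:9092", "test", new SimpleStringSchema())
    //Add the producer as a sink
    dataStream.addSink(kafkaProducer)
    //Execute the streaming program
    env.execute("StreamKafkaSinkDemo")
  }
}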

2. Sink to MySQL

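A custom MySQL sink can be built on RichSinkFunction, mirroring the structure of the MySQL source above: open the connection in open(), insert each element in invoke(), and release resources in close(). A minimal sketch, reusing the assumed student table and the same assumed connection settings (localhost/test, root/root):

import java.sql.{Connection, DriverManager, PreparedStatement}

import org.apache.flink.api.scala._
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}

object StreamToMysqlSink {
  def main(args: Array[String]): Unit = {
    //1. Create the streaming execution environment
    val env: StreamExecutionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment
    //2. Build a small data set of students to write out
    val studentDataStream: DataStream[Student] = env.fromElements(Student(1, "tom", "beijing", "male"))
    //3. Add the custom MySQL sink
    studentDataStream.addSink(new MysqlSink())
    //4. Execute the streaming program
    env.execute("StreamToMysqlSink")
  }

  //Custom sink: open the connection in open(), insert one row per element in invoke(), release resources in close()
  class MysqlSink extends RichSinkFunction[Student]() {
    var connection: Connection = null
    var ps: PreparedStatement = null

    override def open(parameters: Configuration): Unit = {
      val driver = "com.mysql.jdbc.Driver"
      val url = "jdbc:mysql://localhost:3306/test"
      Class.forName(driver)
      connection = DriverManager.getConnection(url, "root", "root")
      ps = connection.prepareStatement("insert into student(id,name,addr,sex) values(?,?,?,?)")
    }

    override def invoke(value: Student): Unit = {
      ps.setInt(1, value.id)
      ps.setString(2, value.name)
      ps.setString(3, value.addr)
      ps.setString(4, value.sex)
      ps.executeUpdate()
    }

    override def close(): Unit = {
      if (ps != null) ps.close()
      if (connection != null) connection.close()
    }
  }

  case class Student(id: Int, name: String, addr: String, sex: String)
}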

 
