Flink (Scala) Learning, Part 1: Common Sources

Source: reading data from a file

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
//import the implicit conversions; putting the import here avoids misleading IDEA code hints
import org.apache.flink.streaming.api.scala._

object FileReadSource {
  def main(args: Array[String]): Unit = {
    //initialize the Flink streaming execution environment
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)

    //read the data
    val dataStream = streamEnv.readTextFile("./data/trafficdata")
    val ds = dataStream.map(line=>{
      val arr = line.split("\t")
      val info = s"${arr(0)}\t${arr(1)}\t${arr(2)}\t${arr(3)}\t${arr(4)}\t${arr(5)}\t${arr(6)}"
      info
    })

    ds.print()
    streamEnv.execute("flink start")
  }
}

Input data layout: (screenshot in the original post)
Final printed output: (screenshot in the original post)
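
The map above only re-joins the seven tab-separated columns into a single string. A common variant is to parse them into a case class so that later operators can work with typed fields. A minimal sketch, with hypothetical names (TrafficRecord, f0..f6), since the real column meanings are only visible in the screenshot:

case class TrafficRecord(f0: String, f1: String, f2: String, f3: String,
                         f4: String, f5: String, f6: String)

val typedStream = dataStream.map(line => {
  val arr = line.split("\t")
  //assumes each line has at least seven columns, as in the map above
  TrafficRecord(arr(0), arr(1), arr(2), arr(3), arr(4), arr(5), arr(6))
})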

Source: reading data from Kafka

Plain String data

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
import java.util.Properties


object KafkaSourceWithoutKey {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    //assemble the Kafka consumer properties
    val props = new Properties()
    props.setProperty("bootstrap.servers", "172.16.254.4:9092,172.16.254.5:9092,172.16.254.6:9092")
//    props.setProperty("bootstrap.servers", "node1:9092,node2:9092,node3:9092")
    props.setProperty("key.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("value.deserializer", classOf[StringDeserializer].getName)
    props.setProperty("group.id", "test01_group")
//    props.setProperty("auto.offset.reset", "latest") //也可以不设置,默认是 flinkKafkaConsumer.setStartFromGroupOffsets(),设置了也不会起作用
    //读取Kafka中的数据
    val lines: DataStream[String] = env.addSource(new FlinkKafkaConsumer[String]("test01", new SimpleStringSchema(), props))
    lines.print()
    //trigger execution
    env.execute()
  }
}

Send a few test messages from the Kafka cluster: (screenshot in the original post)
Printed output: (screenshot in the original post)
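
If a console producer on the cluster is not at hand, test messages for test01 can also be produced from a small Flink job. A minimal sketch, assuming the same universal flink-connector-kafka dependency (which provides FlinkKafkaProducer) is on the classpath; the object name KafkaTestProducer is made up for this example:

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer
import org.apache.flink.streaming.api.scala._
import java.util.Properties

object KafkaTestProducer {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val props = new Properties()
    props.setProperty("bootstrap.servers", "172.16.254.4:9092,172.16.254.5:9092,172.16.254.6:9092")

    //write a few fixed test strings into the test01 topic
    env.fromElements("hello flink", "hello kafka", "hello scala")
      .addSink(new FlinkKafkaProducer[String]("test01", new SimpleStringSchema(), props))

    env.execute("kafka test producer")
  }
}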

Standard key-value data

import org.apache.flink.api.common.typeinfo.TypeInformation
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, createTuple2TypeInformation}
import org.apache.flink.streaming.connectors.kafka.{FlinkKafkaConsumer, KafkaDeserializationSchema}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.flink.streaming.api.scala._
import java.util.Properties

object KafkaSourceWithKey {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    val props = new Properties()
    props.setProperty("bootstrap.servers", "172.16.254.4:9092,172.16.254.5:9092,172.16.254.6:9092")
    props.setProperty("key.serializer", classOf[StringDeserializer].getName)
    props.setProperty("value.serializer", classOf[StringDeserializer].getName)
    props.setProperty("group.id", "test02_group")
    //    props.setProperty("auto.offset.reset","latest") 设置不设置无所谓,因为可以对FlinkKafkaConsumer设置 从什么位置读取数据

    val flinkKafkaConsumer = new FlinkKafkaConsumer[(String, String)]("test02", new KafkaDeserializationSchema[(String, String)] {
      override def isEndOfStream(t: (String, String)): Boolean = false //never treat a record as end of stream

      override def deserialize(consumerRecord: ConsumerRecord[Array[Byte], Array[Byte]]): (String, String) = {
        var key = "0"
        var value = "null"
        if (consumerRecord.key() != null) {
          key = new String(consumerRecord.key(), "UTF-8")
        }
        if (consumerRecord.value() != null) {
          value = new String(consumerRecord.value, "UTF-8")
        }
        (key, value)
      }

      //declare the produced tuple type; createTuple2TypeInformation requires the implicit imports
      override def getProducedType: TypeInformation[(String, String)] = {
        createTuple2TypeInformation(createTypeInformation[String], createTypeInformation[String])
      }
    }, props)

    //start reading Kafka from the latest offset; the default is setStartFromGroupOffsets
    val infos: DataStream[(String, String)] = env.addSource(flinkKafkaConsumer.setStartFromLatest())
    //print the result
    infos.print()
    //trigger execution
    env.execute()
  }
}

Result: (screenshot in the original post)
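
setStartFromLatest() is only one of the start positions that FlinkKafkaConsumer offers; a quick sketch of the alternatives mentioned in the comments above (availability depends on the connector version):

//read the topic from the beginning
flinkKafkaConsumer.setStartFromEarliest()
//default: resume from the offsets committed for the consumer group
flinkKafkaConsumer.setStartFromGroupOffsets()
//start from the first record whose timestamp is at or after the given epoch milliseconds
flinkKafkaConsumer.setStartFromTimestamp(1577071519462L)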

Source: reading data from a collection

import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
//import the implicit conversions; putting the import here avoids misleading IDEA code hints
import org.apache.flink.streaming.api.scala._

case class StationLog(sid: String, callOut: String, callIn: String, callType: String, callTime: Long, duration: Long)

object CollectionSource {
  def main(args: Array[String]): Unit = {
    //initialize the Flink streaming execution environment
    val streamEnv = StreamExecutionEnvironment.getExecutionEnvironment
    streamEnv.setParallelism(1)

    //build the stream from an in-memory collection
    val dataStream = streamEnv.fromCollection(Array(
      StationLog("001", "186", "189", "busy", 1577071519462L, 0),
      StationLog("002", "186", "188", "busy", 1577071520462L, 0),
      StationLog("003", "183", "188", "busy", 1577071521462L, 0),
      StationLog("004", "186", "188", "success", 1577071522462L, 32)
    ))

    dataStream.print()
    streamEnv.execute()
  }
}

Result: (screenshot in the original post)
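
fromCollection also accepts any Scala Seq; for a handful of hard-coded elements, fromElements is an equivalent shorthand. A small sketch reusing the StationLog records above:

//equivalent shorthand for a fixed set of elements
val dataStream2 = streamEnv.fromElements(
  StationLog("001", "186", "189", "busy", 1577071519462L, 0),
  StationLog("004", "186", "188", "success", 1577071522462L, 32)
)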

Source: reading data from a socket

import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment}
import org.apache.flink.streaming.api.scala._

object SocketSource {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment

    // get input data
    val text: DataStream[String] = env.socketTextStream("127.0.0.1", 6666)
    val counts = text.flatMap(_.toLowerCase().split("\\W+"))
      .map((_, 1)).keyBy(0).sum(1)

    counts.print()
    env.execute("Streaming Count")
  }
}

Output: (screenshot in the original post)
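
keyBy(0) and sum(1) address the tuple fields by position; the same pipeline can be written with a key selector, which reads more clearly and also drops the empty tokens that split can produce. A minimal sketch of this variant:

val countsBySelector = text
  .flatMap(_.toLowerCase.split("\\W+"))
  .filter(_.nonEmpty)   //drop empty tokens left by leading/trailing separators
  .map((_, 1))
  .keyBy(_._1)          //key by the word itself instead of tuple index 0
  .sum(1)

countsBySelector.print()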
