Alibaba Cloud Log Service + Flume + Kafka + Spark Streaming -- Part Three

The main part is finally here!

No need to rush; we'll iterate step by step.

The previous two posts covered using Flume to collect logs from Alibaba Cloud Log Service and stream them into Kafka. In other words, we have already connected the Alibaba Cloud Log Service --> Flume --> Kafka pipeline. Let's keep going.

First, let's take a look at the Spark Streaming code.

package com.ruozedata.spark.sls

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object SlsETLApp {
  def main(args: Array[String]): Unit = {

    val brokers = "ip:9092"
    val topic = "sls"
    val groupid = "consume_logs_from_sls"

    val sc = new SparkConf()
      .setAppName("SlsETLApp")
      .setMaster("local[2]")

    val ssc = new StreamingContext(sc, Seconds(10)) // 10-second micro-batch interval


    // Kafka consumer configuration; auto commit is disabled
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      "auto.offset.reset" -> "latest", //earliest
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Subscribe to the topic(s) and create a direct Kafka stream
    val topics = topic.split(",")
    val records = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    // Print each record's value to the console
    records.map(_.value).print()

    ssc.start()
    ssc.awaitTermination()

  }
}

The code above doesn't do much: Spark Streaming simply pulls the data out of Kafka and prints it to the console. To compile and run it, a few Spark artifacts need to be on the classpath; a build sketch follows below.
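Here is a minimal build.sbt sketch, assuming sbt is used; the project name and versions are assumptions, so align them with your own Spark and Scala setup.

// build.sbt sketch -- the name and versions here are assumptions; match them to your cluster
name := "sls-etl"

scalaVersion := "2.11.12"

val sparkVersion = "2.4.0"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-streaming"            % sparkVersion,  // StreamingContext, DStream
  "org.apache.spark" %% "spark-sql"                  % sparkVersion,  // SparkSession, DataFrame
  "org.apache.spark" %% "spark-streaming-kafka-0-10" % sparkVersion   // KafkaUtils direct stream
)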

With that in place, let's see what it looks like when it runs.

Before this, I had also started a separate Kafka console consumer, and from the console we can see that the data in the screenshot above matches the one below.

Next, let's keep iterating on the code.

package com.ruozedata.spark.sls

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DateType, StringType, StructField, StructType}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object SlsETLApp {
  def main(args: Array[String]): Unit = {

    val brokers = "ip:9092"
    val topic = "sls"
    val groupId = "consume_logs_from_sls"

    val sc = new SparkConf()
      .setAppName("SlsETLApp")
      .setMaster("local[2]")

    val ssc = new StreamingContext(sc, Seconds(10))


    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest", //earliest
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = topic.split(",")
    val records = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )
    
    // For each micro-batch: convert the JSON strings to a DataFrame and pick out the fields we need
    records.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()
      import spark.implicits._
      val df = spark.read.json(spark.createDataset(rdd))
      import org.apache.spark.sql.functions._
      df.select(
        split(df("__tag__:__path__"), "/")(2).as("topic"),
        split(df("__tag__:__path__"), "/")(4).as("log_date"),
        df("online").as("online_number")
      ).show(false)


    })

    ssc.start()
    ssc.awaitTermination()

  }
}

So what does this version do? Spark Streaming consumes JSON data from the Kafka topic sls, and inside foreachRDD each batch of records is converted into a DataFrame. There are several ways to hand RDD data over to Spark SQL; this post gives a good overview: https://blog.csdn.net/shirukai/article/details/85211951. One alternative, passing an explicit schema instead of relying on inference, is sketched below.
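For reference, here is a minimal, standalone sketch of that alternative: supplying an explicit schema rather than letting Spark infer it from the data. The field names mirror the SLS log layout used above; the object name and the sample record are purely illustrative.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{StringType, StructField, StructType}

object SchemaSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("SchemaSketch").master("local[2]").getOrCreate()
    import spark.implicits._

    // Explicit schema: only the columns we actually care about
    val schema = StructType(Seq(
      StructField("__tag__:__path__", StringType, nullable = true),
      StructField("online", StringType, nullable = true)
    ))

    // Hypothetical sample record; in the streaming job this would be spark.createDataset(rdd)
    val sample = Seq("""{"__tag__:__path__":"/logs/sls/2019/07/10/app.log","online":"100"}""").toDS()

    // With an explicit schema, the columns exist even when a batch has no rows
    val df = spark.read.schema(schema).json(sample)
    df.show(false)

    spark.stop()
  }
}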

Run the code and, sure enough, it fails:

19/07/10 18:27:34 ERROR JobScheduler: Error running job streaming job 1562754450000 ms.0
org.apache.spark.sql.AnalysisException: Cannot resolve column name "__tag__:__path__" among ();
	at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:222)
	at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:222)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.Dataset.resolve(Dataset.scala:221)
	at org.apache.spark.sql.Dataset.col(Dataset.scala:1241)
	at org.apache.spark.sql.Dataset.apply(Dataset.scala:1208)
	at com.ruozedata.spark.sls.SlsETLApp$$anonfun$main$2.apply(SlsETLApp.scala:48)
	at com.ruozedata.spark.sls.SlsETLApp$$anonfun$main$2.apply(SlsETLApp.scala:42)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Exception in thread "main" org.apache.spark.sql.AnalysisException: Cannot resolve column name "__tag__:__path__" among ();
	at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:222)
	at org.apache.spark.sql.Dataset$$anonfun$resolve$1.apply(Dataset.scala:222)
	at scala.Option.getOrElse(Option.scala:121)
	at org.apache.spark.sql.Dataset.resolve(Dataset.scala:221)
	at org.apache.spark.sql.Dataset.col(Dataset.scala:1241)
	at org.apache.spark.sql.Dataset.apply(Dataset.scala:1208)
	at com.ruozedata.spark.sls.SlsETLApp$$anonfun$main$2.apply(SlsETLApp.scala:48)
	at com.ruozedata.spark.sls.SlsETLApp$$anonfun$main$2.apply(SlsETLApp.scala:42)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1$$anonfun$apply$mcV$sp$3.apply(DStream.scala:628)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
19/07/10 18:27:34 INFO StreamingContext: Invoking stop(stopGracefully=false) from shutdown hook
.....

What's going on? It's puzzling at first, but we overlooked one thing: some batches may carry no data at all. Spark SQL builds the DataFrame by inferring the schema from the JSON, so for an empty batch there is nothing to infer from, and the columns referenced in the select simply don't exist. A short illustration of this behaviour follows, and the updated application code comes right after it.
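To make the failure mode concrete, here is a tiny snippet, assumed to be pasted into spark-shell where spark and its implicits are already available: inferring a schema from an empty Dataset[String] yields a DataFrame with no columns, which is exactly why the select fails with "Cannot resolve column name ... among ()".

import spark.implicits._

val emptyBatch = spark.emptyDataset[String]
val inferred = spark.read.json(emptyBatch)

inferred.printSchema()   // prints only "root" -- no fields were inferred
inferred.columns.length  // 0, so inferred("__tag__:__path__") cannot be resolved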

package com.ruozedata.spark.sls

import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.{DateType, StringType, StructField, StructType}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, KafkaUtils}
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent

object SlsETLApp {
  def main(args: Array[String]): Unit = {

    val brokers = "ip:9092"
    val topic = "sls"
    val groupId = "consume_logs_from_sls"

    val sc = new SparkConf()
      .setAppName("SlsETLApp")
      .setMaster("local[2]")

    val ssc = new StreamingContext(sc, Seconds(10))


    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      "auto.offset.reset" -> "latest", //earliest
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = topic.split(",")
    val records = KafkaUtils.createDirectStream[String, String](
      ssc,
      PreferConsistent,
      Subscribe[String, String](topics, kafkaParams)
    )


    records.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()

      if (rdd.isEmpty) {
        println("No logs in this time interval")
        return
      } else {
        import spark.implicits._
        val df = spark.read.json(spark.createDataset(rdd))
        import org.apache.spark.sql.functions._
        df.select(
          split(df("__tag__:__path__"), "/")(2).as("topic"),
          split(df("__tag__:__path__"), "/")(4).as("log_date"),
          df("online").as("online_number")
        ).show(false)
      }

    })


    ssc.start()
    ssc.awaitTermination()

  }
}

Run the code again and it still fails. When you hit an error, don't panic; let's see where the problem is.

19/07/10 18:36:35 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, 192.168.1.3, 55920, None)
19/07/10 18:36:36 WARN KafkaUtils: overriding enable.auto.commit to false for executor
19/07/10 18:36:36 WARN KafkaUtils: overriding auto.offset.reset to none for executor
19/07/10 18:36:36 WARN KafkaUtils: overriding executor group.id to spark-executor-consume_logs_from_sls
19/07/10 18:36:36 WARN KafkaUtils: overriding receive.buffer.bytes to 65536 see KAFKA-3135
Exception in thread "main" org.apache.spark.util.ReturnStatementInClosureException: Return statements aren't allowed in Spark closures
	at org.apache.spark.util.ReturnStatementFinder$$anon$1.visitTypeInsn(ClosureCleaner.scala:377)
	at org.apache.xbean.asm5.ClassReader.a(Unknown Source)
	at org.apache.xbean.asm5.ClassReader.b(Unknown Source)
	at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
	at org.apache.xbean.asm5.ClassReader.accept(Unknown Source)
	at org.apache.spark.util.ClosureCleaner$.org$apache$spark$util$ClosureCleaner$$clean(ClosureCleaner.scala:248)
	at org.apache.spark.util.ClosureCleaner$.clean(ClosureCleaner.scala:159)
	at org.apache.spark.SparkContext.clean(SparkContext.scala:2292)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply$mcV$sp(DStream.scala:627)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:626)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$foreachRDD$1.apply(DStream.scala:626)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.SparkContext.withScope(SparkContext.scala:692)
	at org.apache.spark.streaming.StreamingContext.withScope(StreamingContext.scala:265)
	at org.apache.spark.streaming.dstream.DStream.foreachRDD(DStream.scala:626)
	at com.ruozedata.spark.sls.SlsETLApp$.main(SlsETLApp.scala:43)
	at com.ruozedata.spark.sls.SlsETLApp.main(SlsETLApp.scala)
19/07/10 18:36:36 INFO SparkContext: Invoking stop() from shutdown hook

The message is clear enough: return statements are not allowed inside Spark closures. A quick search turns up a write-up on exactly this: https://www.jianshu.com/p/2053634328d3

So let's simply drop the return; the resulting foreachRDD body is sketched below.
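For completeness, here is the foreachRDD body with the return removed; it is just the code above rearranged so that the empty-batch branch logs a message and falls through, and nothing else changes.

    records.map(record => record.value()).foreachRDD(rdd => {
      val spark = SparkSession.builder().config(rdd.sparkContext.getConf).getOrCreate()

      if (rdd.isEmpty) {
        // Empty batch: log it and let the closure finish normally -- no early return needed
        println("No logs in this time interval")
      } else {
        import spark.implicits._
        val df = spark.read.json(spark.createDataset(rdd))
        import org.apache.spark.sql.functions._
        df.select(
          split(df("__tag__:__path__"), "/")(2).as("topic"),
          split(df("__tag__:__path__"), "/")(4).as("log_date"),
          df("online").as("online_number")
        ).show(false)
      }
    })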

Run the code once more.

With that, the whole pipeline runs end to end. Next, we'll implement a few concrete requirements on top of it. Stay tuned!
