Landing Kafka data into HDFS in real time

It has been a while since my last blog post!

This post shows how to land real-time data from Kafka into an HDFS directory using Spark Streaming.
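The code below reads its Kafka connection settings from rm_cdm.conf via Typesafe Config. That file is not shown in the post; a minimal sketch of the keys it would have to provide, with placeholder values only, might look like this:

# rm_cdm.conf -- hypothetical sketch; broker list, group and topic are placeholders
kafka_brokers_iframe = "broker1:9092,broker2:9092"

idmapping_notice_iframe {
  group = "rt_change_notice_group"
  topic = "idmapping_notice"
}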

import java.time.ZonedDateTime
import java.time.format.DateTimeFormatter

import com.alibaba.fastjson.{JSON, JSONArray}
import com.ipinyou.cdp.common.SparkBase
import com.ipinyou.cdp.util.KafkaUtil
import com.typesafe.config.ConfigFactory
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.{CanCommitOffsets, HasOffsetRanges, OffsetRange}
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization
import org.slf4j.LoggerFactory

/** Land real-time change messages into HDFS for testers to verify */
object RTChangeNotice2Hdfs extends SparkBase {

  val logger = LoggerFactory.getLogger(this.getClass)
  /** Configuration holding the Kafka topic and consumer group to pull from */
  private val conf = ConfigFactory.load("rm_cdm.conf")
  /** Kafka brokers to consume data from */
  val brokers = conf.getString("kafka_brokers_iframe")

//  /** Pull data from Kafka and land it into HDFS in real time */
//  def rt_change_notice(spark: SparkSession, ssc: StreamingContext) = {
//    logger.info("brokers: " + brokers + " group_id: " + conf.getString("idmapping_notice_iframe.group") + " topic: " + conf.getString("idmapping_notice_iframe.topic"))
//    val recordDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtil.createStream(ssc,
//      brokers, conf.getString("idmapping_notice_iframe.group"),
//      conf.getString("idmapping_notice_iframe.topic").split(","))
//    import spark.implicits._
//    recordDStream.foreachRDD(rdd => {
//      val offRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
//      rdd.map(_.value()).map(line => {
//        /** Timestamp carried by the message */
//        val message_time = JSON.parseObject(line).getString("message_timestamp")
//        /** Unique message id carried by the message */
//        val msgid = JSON.parseObject(line).getString("message_id")
//        val data: JSONArray = JSON.parseObject(line).getJSONArray("data")
//        val rest_json = for (i <- 0 until data.size()) yield {
//          val opType = data.getJSONObject(i).getString("op_type")
//          val cdmid = data.getJSONObject(i).getJSONObject("data").getString("cdmid")
//          val result_json: String = get_result(message_time, msgid, opType, cdmid)
//          result_json
//        }
//
//        rest_json
//
//      }).filter(ele => ele.size > 0).toDF("result").coalesce(1).write.format("parquet")
//        .mode(SaveMode.Append).insertInto("cdp_temp.rt_change_notice")
//
//      recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offRanges)
//    })
//
//    ssc.start()
//    ssc.awaitTermination()
//  }


  /** Pull data from Kafka and land it into HDFS in real time */
  def rt_change_notice(spark: SparkSession, ssc: StreamingContext) = {
    logger.info("brokers: " + brokers + " group_id: " + conf.getString("idmapping_notice_iframe.group") + " topic: " + conf.getString("idmapping_notice_iframe.topic"))
    val recordDStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtil.createStream(ssc,
      brokers, conf.getString("idmapping_notice_iframe.group"),
      conf.getString("idmapping_notice_iframe.topic").split(","))
    import spark.implicits._
    recordDStream.foreachRDD(rdd => {
      val offRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      rdd.map(_.value()).map(line => {
        /** Timestamp carried by the message */
        val message_time = JSON.parseObject(line).getString("message_timestamp")
        /** Unique message id carried by the message */
        val msgid = JSON.parseObject(line).getString("message_id")
        val data: JSONArray = JSON.parseObject(line).getJSONArray("data")
        val rest_json = for (i <- 0 until data.size()) yield {
          val opType = data.getJSONObject(i).getString("op_type")
          val cdmid = data.getJSONObject(i).getJSONObject("data").getString("cdmid")
          val result_json: String = get_result(message_time, msgid, opType, cdmid)
          result_json
        }

        rest_json

      }).toDF("result").coalesce(1).write.format("parquet")
        .mode(SaveMode.Append).save("/tmp/logs/rt_change_notice")

      recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offRanges)
    })

    ssc.start()
    ssc.awaitTermination()
  }

  /** Assemble the fields into a JSON string */
  def get_result(message_time: String, message_id: String, opType: String, cdmid: String) = {
    //    val timestamp = System.currentTimeMillis()
    val now = ZonedDateTime.now
    val formatter = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    val timestamp = formatter.format(now)
    val profileType = "CUSTOMER"
    implicit val formats = Serialization.formats(NoTypeHints)
    val data_map = Map(
      /** Message id */
      "msgid" -> message_id,
      /** Timestamp carried by the message from the upstream Kafka topic */
      "msgtimestamp" -> message_time,
      /** Timestamp at which the record is pushed downstream */
      "timestamp" -> timestamp,
      /** Profile type, fixed to CUSTOMER */
      "profileType" -> profileType,
      /** Value of cdmid */
      "cdmid" -> cdmid,
      /** Operation type of the idmapping record */
      "op_type" -> opType
    )
    val result: String = Serialization.write(data_map)
    result
  }

  def main(args: Array[String]): Unit = {

    val spark = SparkSession.builder().appName("kafka_rt_2hdfs")
      .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      .config("spark.sql.parquet.writeLegacyFormat", "true")
      .config("spark.sql.sources.partitionColumnTypeInference.enabled", "true")
      .config("mergeSchema", "true")
      .config("spark.sql.hive.convertMetastoreParquet", "false")
      .config("spark.streaming.stopGracefullyOnShutdown", "true")
      .config("spark.streaming.backpressure.enabled", "true")
      .config("spark.streaming.stopGracefullyOnShutdown", "true")
      .getOrCreate()

    val ssc: StreamingContext = new StreamingContext(spark.sparkContext, Seconds(10))

    rt_change_notice(spark, ssc)
    spark.stop()
  }
}
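KafkaUtil.createStream and SparkBase are project-internal helpers and are not shown in the post. A minimal sketch of what the Kafka helper could look like, built on KafkaUtils.createDirectStream from spark-streaming-kafka-0-10 (the method name and parameter order simply mirror how it is called above; everything else is an assumption):

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

object KafkaUtil {
  /** Create a direct stream subscribed to the given topics. */
  def createStream(ssc: StreamingContext, brokers: String, group: String,
                   topics: Array[String]): InputDStream[ConsumerRecord[String, String]] = {
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> brokers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> group,
      "auto.offset.reset" -> "latest",
      // Offsets are committed manually via commitAsync after each write.
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
  }
}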

Of the two rt_change_notice methods above, the commented-out one does not land empty files, while the uncommented one does create empty files when a micro-batch carries no data. Since this program is meant for testers and every output directory should exist, empty files included, the uncommented version is the one that runs; a sketch of how to skip empty batches instead follows below.
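If empty output is not wanted, a common alternative (a sketch, not the author's code) is to guard the write with RDD.isEmpty while keeping the offset commit outside the guard:

    recordDStream.foreachRDD(rdd => {
      val offRanges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      val values = rdd.map(_.value())
      // values.isEmpty() checks whether the micro-batch carries any records,
      // so no empty parquet output is written for idle batches.
      if (!values.isEmpty()) {
        values.map(line => {
          val obj = JSON.parseObject(line)
          val data = obj.getJSONArray("data")
          for (i <- 0 until data.size()) yield
            get_result(obj.getString("message_timestamp"), obj.getString("message_id"),
              data.getJSONObject(i).getString("op_type"),
              data.getJSONObject(i).getJSONObject("data").getString("cdmid"))
        }).toDF("result").coalesce(1).write.format("parquet")
          .mode(SaveMode.Append).save("/tmp/logs/rt_change_notice")
      }
      recordDStream.asInstanceOf[CanCommitOffsets].commitAsync(offRanges)
    })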

The overall approach combines Spark Streaming, Kafka and DataFrames, appending each micro-batch to HDFS with SaveMode.Append.
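Once the job is running, testers can read the landed data back for a quick sanity check, for example from spark-shell (a usage sketch; the path is the one hard-coded in the job above):

    val landed = spark.read.parquet("/tmp/logs/rt_change_notice")
    landed.printSchema()
    landed.show(20, truncate = false)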
