Spark Streaming Reading Kafka Data (Complex Nested JSON Format)


Project overview: the source side was reworked to use OGG to extract data from Oracle into Kafka. We have to write our own code to parse the OGG records in Kafka; the data format is fairly complex (nested JSON), so a second round of parsing is required.

Sample OGG record for one table (before holds the row image before the change, after holds the row image after the change):

{"table":"SBPOPT.TT_REPAIR_PART","op_type":"U","op_ts":"2020-04-30 02:48:47.001585","current_ts":"2020-04-30T10:48:55.043000","pos":"00000000010000255954","before":{"COMMISSION_NO":"200607-0006","BILL_NO":"200607#0007","SST_CODE":"74327050","PART_CODE":"034 115 561 A","PART_NAME":"DZ","REPAIR_TYPE":"","PART_NUM":1.00,"DECODE":null,"AMOUNT":23.00,"SALE_PRICE":23.00,"WARRANTY_SIGN":"0  ","WARRANTY_REASON":null,"COMPANY_ID":0,"FLAG":"1 ","PART_DATE":"2006-07-01 18:26:21","VIN":"LSVHH133522135440","KD_CODE":null,"IS_V":null,"NEEDLESS_REPAIR":null,"IS_PRING":null,"ACTIVITY_CODE":null,"ACTIVITY_TYPE":null,"REPAIR_DATE":"2006-07-01 11:15:00","PART_SOFTWARE_VER":null,"SAFETY_PART_SERIAL":null,"NEGABALANCE_SIGN":"F","STORAGE_CODE":null,"STORAGE":null,"BUY_PRICE":null,"AVERAGE_PRICE":null,"IS_STAFF_YOUHUI":null,"PART_TYPE":null,"PART_DISCOUNT_FEE":null,"PART_AMOUNT":null,"PART_DISCOUNT":null,"IS_PRE_SERVICE_PART":null,"PART_NATURE":null,"PRE_ITEM_ID":null,"WARN_INFO_TYPE":null,"ID":null,"NEW_PRE_ITEM_ID":null,"PART_ORDER_PIRCE":null,"PART_ORDER_AMOUNT":null,"PPSO_PACKAGE_ID":null,"PPSO_PACKAGE_NAME":null},"after":{"COMMISSION_NO":"200607-0006","BILL_NO":"200607#0007","SST_CODE":"74327050","PART_CODE":"034 115 561 A","PART_NAME":"DZ","REPAIR_TYPE":"","PART_NUM":20.00,"DECODE":null,"AMOUNT":23.00,"SALE_PRICE":23.00,"WARRANTY_SIGN":"0  ","WARRANTY_REASON":null,"COMPANY_ID":0,"FLAG":"1 ","PART_DATE":"2006-07-01 18:26:21","VIN":"LSVHH133522135440","KD_CODE":null,"IS_V":null,"NEEDLESS_REPAIR":null,"IS_PRING":null,"ACTIVITY_CODE":null,"ACTIVITY_TYPE":null,"REPAIR_DATE":"2006-07-01 11:15:00","PART_SOFTWARE_VER":null,"SAFETY_PART_SERIAL":null,"NEGABALANCE_SIGN":"F","STORAGE_CODE":null,"STORAGE":null,"BUY_PRICE":null,"AVERAGE_PRICE":null,"IS_STAFF_YOUHUI":null,"PART_TYPE":null,"PART_DISCOUNT_FEE":null,"PART_AMOUNT":null,"PART_DISCOUNT":null,"IS_PRE_SERVICE_PART":null,"PART_NATURE":null,"PRE_ITEM_ID":null,"WARN_INFO_TYPE":null,"ID":null,"NEW_PRE_ITEM_ID":null,"PART_ORDER_PIRCE":null,"PART_ORDER_AMOUNT":null,"PPSO_PACKAGE_ID":null,"PPSO_PACKAGE_NAME":null}}

Spark version: 2.1.0
Kafka version: 0.9.0

Maven dependencies:

<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>net.minidev</groupId>
    <artifactId>json-smart</artifactId>
    <version>2.3</version>
</dependency>

The parsing logic, roughly:
op_type=I: emit one record, with op_type=I
op_type=U: emit two records, one with op_type=I (from after) and one with op_type=D (from before)
op_type=D: emit one record, with op_type=D
The computation logic, roughly (see the aggregation sketch after this list):
op_type=I: one new record, add AMOUNT to the running sum
op_type=U: two new records, add the new AMOUNT and subtract the old AMOUNT
op_type=D: one new record, subtract AMOUNT from the running sum
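
As a minimal sketch of that computation (not part of the original job): assuming the flattened records have been registered as the temp view a, as in the code below, and using BILL_NO only as an illustrative grouping key, the per-batch amount could be aggregated like this:

// Hypothetical aggregation over the temp view "a" built by the code below.
// I records contribute +AMOUNT, D records contribute -AMOUNT
// (U records were already split into one I and one D record during parsing).
val sums = sqlContext.sql(
    """
      |SELECT BILL_NO,
      |       SUM(CASE WHEN op_type = 'I' THEN AMOUNT ELSE -AMOUNT END) AS amount_delta
      |FROM a
      |GROUP BY BILL_NO
    """.stripMargin)
sums.show(false)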
Full code:

import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{Dataset, SQLContext, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by LHX on 2020/5/13 9:49.
  * Local test: read data from Kafka and do the second round of JSON parsing.
  */
object ReadJsonFile {
    def main(args: Array[String]): Unit = {
        val conf = new SparkConf().setMaster("local[2]").setAppName("ReadJsonFile")
        val sc=new SparkContext(conf)
        val ssc = new StreamingContext(sc, Seconds(10))
        val kafkaParams = Map[String, Object](
            "bootstrap.servers" -> "localhost:9092",
            "key.deserializer" -> classOf[StringDeserializer],
            "value.deserializer" -> classOf[StringDeserializer],
            "group.id" -> "spark_streaming",
            "auto.offset.reset" -> "earliest",
            "enable.auto.commit" -> (false: java.lang.Boolean)
        )

        val topics = Array("test_topic")
        val dStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
            ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))

        dStream.foreachRDD(rdd => {
            // Get the singleton SQLContext for this RDD's SparkContext
            val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
            import sqlContext.implicits._
            val frameRdd: RDD[String] = rdd.flatMap(line => {
                // Parse each Kafka message value as a JSON object
                println("===" + line.value)
                val jsonParser = new JSONParser()
                val jsonObj: JSONObject = jsonParser.parse(line.value()).asInstanceOf[JSONObject]
                val op_type = jsonObj.get("op_type").toString
                if (op_type == "U") {
                    // Update: emit the before image as a D record and the after image as an I record
                    val op_ts = jsonObj.getAsString("op_ts")
                    val before = jsonObj.getAsString("before").dropRight(1) + ",\"op_type\":\"D\",\"ts\":\"" + op_ts + "a\"}"
                    val after = jsonObj.getAsString("after").dropRight(1) + ",\"op_type\":\"I\",\"ts\":\"" + op_ts + "b\"}"
                    Array(before, after)
                } else if (op_type == "I") {
                    // Insert: emit the after image as an I record
                    val op_ts = jsonObj.getAsString("op_ts")
                    val after = jsonObj.getAsString("after").dropRight(1) + ",\"op_type\":\"I\",\"ts\":\"" + op_ts + "\"}"
                    Array(after)
                } else {
                    // Delete: emit the before image as a D record
                    val op_ts = jsonObj.getAsString("op_ts")
                    val before = jsonObj.getAsString("before").dropRight(1) + ",\"op_type\":\"D\",\"ts\":\"" + op_ts + "\"}"
                    Array(before)
                }
            })
            // Load the flattened JSON strings into a DataFrame and register a temp view
            sqlContext.read.json(frameRdd).createOrReplaceTempView("a")
            val df = sqlContext.sql("select * from a")
            df.show(true)
        })
        ssc.start()
        ssc.awaitTermination()
    }
}
object SQLContextSingleton {
    @transient  private var instance: SQLContext = _
    def getInstance(sparkContext: SparkContext): SQLContext = {
        if (instance == null) {
            instance = new SQLContext(sparkContext)
        }
        instance
    }
}
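
For local testing, a minimal producer sketch (not part of the original job) can push the sample OGG record into the topic; it assumes a broker at localhost:9092 and the test_topic topic used above:

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object SendOggTestRecord {
    def main(args: Array[String]): Unit = {
        val props = new Properties()
        props.put("bootstrap.servers", "localhost:9092")
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
        val producer = new KafkaProducer[String, String](props)
        // Abbreviated here; use the full OGG sample record from the top of the post
        val oggRecord = """{"table":"SBPOPT.TT_REPAIR_PART","op_type":"U","op_ts":"2020-04-30 02:48:47.001585", ... }"""
        producer.send(new ProducerRecord[String, String]("test_topic", oggRecord))
        producer.flush()
        producer.close()
    }
}

After a record is sent, the streaming job above should print the flattened rows via df.show within one 10-second batch.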

Testing against the local Kafka cluster, the result is as follows:
[screenshot of the test output omitted]
