Project overview: as part of reworking the source side, OGG (Oracle GoldenGate) extracts data from Oracle into Kafka. The OGG records landing in Kafka have to be parsed by our own code, and the format is fairly involved (nested JSON), so a second-level parse is required.
OGG record format for one table (before is the row image before the change, after is the image after it):
{"table":"SBPOPT.TT_REPAIR_PART","op_type":"U","op_ts":"2020-04-30 02:48:47.001585","current_ts":"2020-04-30T10:48:55.043000","pos":"00000000010000255954","before":{"COMMISSION_NO":"200607-0006","BILL_NO":"200607#0007","SST_CODE":"74327050","PART_CODE":"034 115 561 A","PART_NAME":"DZ","REPAIR_TYPE":"","PART_NUM":1.00,"DECODE":null,"AMOUNT":23.00,"SALE_PRICE":23.00,"WARRANTY_SIGN":"0 ","WARRANTY_REASON":null,"COMPANY_ID":0,"FLAG":"1 ","PART_DATE":"2006-07-01 18:26:21","VIN":"LSVHH133522135440","KD_CODE":null,"IS_V":null,"NEEDLESS_REPAIR":null,"IS_PRING":null,"ACTIVITY_CODE":null,"ACTIVITY_TYPE":null,"REPAIR_DATE":"2006-07-01 11:15:00","PART_SOFTWARE_VER":null,"SAFETY_PART_SERIAL":null,"NEGABALANCE_SIGN":"F","STORAGE_CODE":null,"STORAGE":null,"BUY_PRICE":null,"AVERAGE_PRICE":null,"IS_STAFF_YOUHUI":null,"PART_TYPE":null,"PART_DISCOUNT_FEE":null,"PART_AMOUNT":null,"PART_DISCOUNT":null,"IS_PRE_SERVICE_PART":null,"PART_NATURE":null,"PRE_ITEM_ID":null,"WARN_INFO_TYPE":null,"ID":null,"NEW_PRE_ITEM_ID":null,"PART_ORDER_PIRCE":null,"PART_ORDER_AMOUNT":null,"PPSO_PACKAGE_ID":null,"PPSO_PACKAGE_NAME":null},"after":{"COMMISSION_NO":"200607-0006","BILL_NO":"200607#0007","SST_CODE":"74327050","PART_CODE":"034 115 561 A","PART_NAME":"DZ","REPAIR_TYPE":"","PART_NUM":20.00,"DECODE":null,"AMOUNT":23.00,"SALE_PRICE":23.00,"WARRANTY_SIGN":"0 ","WARRANTY_REASON":null,"COMPANY_ID":0,"FLAG":"1 ","PART_DATE":"2006-07-01 18:26:21","VIN":"LSVHH133522135440","KD_CODE":null,"IS_V":null,"NEEDLESS_REPAIR":null,"IS_PRING":null,"ACTIVITY_CODE":null,"ACTIVITY_TYPE":null,"REPAIR_DATE":"2006-07-01 11:15:00","PART_SOFTWARE_VER":null,"SAFETY_PART_SERIAL":null,"NEGABALANCE_SIGN":"F","STORAGE_CODE":null,"STORAGE":null,"BUY_PRICE":null,"AVERAGE_PRICE":null,"IS_STAFF_YOUHUI":null,"PART_TYPE":null,"PART_DISCOUNT_FEE":null,"PART_AMOUNT":null,"PART_DISCOUNT":null,"IS_PRE_SERVICE_PART":null,"PART_NATURE":null,"PRE_ITEM_ID":null,"WARN_INFO_TYPE":null,"ID":null,"NEW_PRE_ITEM_ID":null,"PART_ORDER_PIRCE":null,"PART_ORDER_AMOUNT":null,"PPSO_PACKAGE_ID":null,"PPSO_PACKAGE_NAME":null}}
Spark: 2.1.0
Kafka: 0.9.0 (note that the spark-streaming-kafka-0-10 connector used below officially targets brokers 0.10.0+, so this pairing is worth verifying against your cluster)
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.1.0</version>
</dependency>
<dependency>
    <groupId>net.minidev</groupId>
    <artifactId>json-smart</artifactId>
    <version>2.3</version>
</dependency>
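With json-smart on the classpath, the two-level parse mentioned in the overview can be exercised standalone before wiring anything into Spark. A minimal sketch (ParseOnce is a hypothetical name; the record is the sample above with most columns elided):

import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser

object ParseOnce {
  def main(args: Array[String]): Unit = {
    // Abbreviated version of the sample record shown above
    val record = """{"table":"SBPOPT.TT_REPAIR_PART","op_type":"U","op_ts":"2020-04-30 02:48:47.001585","before":{"AMOUNT":23.00},"after":{"AMOUNT":20.00}}"""
    val envelope = new JSONParser(JSONParser.DEFAULT_PERMISSIVE_MODE).parse(record).asInstanceOf[JSONObject]
    // First level: envelope fields
    println(envelope.getAsString("table") + " " + envelope.getAsString("op_type"))
    // Second level: the row images are nested JSONObjects
    val before = envelope.get("before").asInstanceOf[JSONObject]
    val after = envelope.get("after").asInstanceOf[JSONObject]
    println("AMOUNT: " + before.get("AMOUNT") + " -> " + after.get("AMOUNT"))
  }
}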
The parsing logic, roughly:
op_type=I: emit one record, tagged op_type=I (the after image)
op_type=U: emit two records, tagged op_type=D (the before image) and op_type=I (the after image)
op_type=D: emit one record, tagged op_type=D (the before image)
The aggregation logic, roughly (a SQL sketch follows this list):
op_type=I: one record, contributing sum(+AMOUNT)
op_type=U: two records, contributing sum(AMOUNT_after - AMOUNT_before)
op_type=D: one record, contributing sum(-AMOUNT)
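The streaming job below only runs select * for inspection; the signed sum itself is not part of the original code. A minimal sketch of how it could be expressed over the temp view a that the job registers (assuming the AMOUNT and op_type columns produced by the flattening):

// I rows contribute +AMOUNT, D rows contribute -AMOUNT, so a U record
// (one D row plus one I row) nets to AMOUNT_after - AMOUNT_before.
val totals = sqlContext.sql(
  """SELECT SUM(CASE WHEN op_type = 'I' THEN  AMOUNT
    |                WHEN op_type = 'D' THEN -AMOUNT END) AS total_amount
    |  FROM a""".stripMargin)
totals.show()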
The full code:
import net.minidev.json.JSONObject
import net.minidev.json.parser.JSONParser
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe
import org.apache.spark.streaming.kafka010.KafkaUtils
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.{Seconds, StreamingContext}

/**
  * Created by LHX on 2020/5/13 9:49.
  * Local test: read OGG records from Kafka and run the second-level parse.
  */
object ReadJsonFile {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("ReadJsonFile")
    val sc = new SparkContext(conf)
    val ssc = new StreamingContext(sc, Seconds(10))
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "spark_streaming",
      "auto.offset.reset" -> "earliest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val topics = Array("test_topic")
    val dStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams))
    dStream.foreachRDD(rdd => {
      // Get the lazily created singleton SQLContext
      val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
      import sqlContext.implicits._
      // Flatten each OGG envelope into one or two flat JSON rows
      val frameRdd: RDD[String] = rdd.flatMap(line => {
        println("===" + line.value)
        val jsonParser = new JSONParser()
        val jsonObj: JSONObject = jsonParser.parse(line.value()).asInstanceOf[JSONObject]
        val op_type = jsonObj.get("op_type").toString
        // getAsString on a nested JSONObject returns its JSON text, so the
        // extra op_type/ts fields can be spliced into the row image by
        // dropping the closing '}' and appending them as a string.
        if (op_type == "U") {
          val op_ts = jsonObj.getAsString("op_ts")
          // The "a"/"b" suffixes make the delete sort before the insert when ordering by ts
          val before = jsonObj.getAsString("before").dropRight(1) + ",\"op_type\":\"D\",\"ts\":\"" + op_ts + "a\"}"
          val after = jsonObj.getAsString("after").dropRight(1) + ",\"op_type\":\"I\",\"ts\":\"" + op_ts + "b\"}"
          Array(before, after)
        } else if (op_type == "I") {
          val op_ts = jsonObj.getAsString("op_ts")
          val after = jsonObj.getAsString("after").dropRight(1) + ",\"op_type\":\"I\",\"ts\":\"" + op_ts + "\"}"
          Array(after)
        } else {
          val op_ts = jsonObj.getAsString("op_ts")
          val before = jsonObj.getAsString("before").dropRight(1) + ",\"op_type\":\"D\",\"ts\":\"" + op_ts + "\"}"
          Array(before)
        }
      })
      // Infer the schema from the flattened JSON and register a temp view
      sqlContext.read.json(frameRdd).createOrReplaceTempView("a")
      val df = sqlContext.sql("select * from a")
      df.show(true)
    })
    ssc.start()
    ssc.awaitTermination()
  }
}

// Lazily initialized singleton so each micro-batch reuses one SQLContext
object SQLContextSingleton {
  @transient private var instance: SQLContext = _
  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
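To feed the local test, a record can be pushed into test_topic either with kafka-console-producer or with a small program. A sketch using the Kafka Java client (SendSample is a hypothetical helper; the record is abbreviated, in practice send the full sample above on one line):

import java.util.Properties
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object SendSample {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    props.put("bootstrap.servers", "localhost:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    val producer = new KafkaProducer[String, String](props)
    // Abbreviated OGG record used as the test message
    val record = """{"table":"SBPOPT.TT_REPAIR_PART","op_type":"I","op_ts":"2020-04-30 02:48:47.001585","after":{"AMOUNT":23.00}}"""
    producer.send(new ProducerRecord[String, String]("test_topic", record))
    producer.close()
  }
}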
Running against the local Kafka cluster, the test results are as follows: