JSON数据格式
{
"header": {
"traceId": "06ad872d5d5bfa0d",
"appName": "zeus-merchant",
"deviceType": null,
"version": null,
"userAgent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36",
"ip": "119.162.98.224",
"sysTime": "2021-06-11 13:36:27"
},
"body": [
{
"txnType": 1,
"txnCode": "10100720012021061111522994861596",
"txnSubCode": "18399",
"refCode": "10200720012021061113362702629760",
"txnValue": null,
"event": 1,
"subEvent": null,
"userId": "307426",
"extend": null
}
]
}
代码:
import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.{Row, SparkSession, functions}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import scala.collection.mutable.ArrayBuffer
/**
 * Spark Streaming job for the ZEUS_MIS topic.
 *
 * Consumes JSON events from Kafka, where each message carries a `header`
 * object and a `body` array (see the sample payload above). Every body
 * element is joined with its header and flattened into one row; the
 * resulting 16-column DataFrame is printed each micro-batch.
 */
object cainiao {

  /**
   * Reads the string value of `key` from `obj`, mapping a JSON null or a
   * missing key to the literal string "null".
   *
   * Kept as the placeholder (rather than `Option`/SQL NULL) for backward
   * compatibility with the original output format.
   */
  private def strOrNull(obj: JSONObject, key: String): String =
    Option(obj.getString(key)).getOrElse("null")

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("zeus_mis streaming")
    conf.setMaster("local")
    conf.set("spark.testing.memory", "2147480000")
    conf.set("spark.executor.instances", "4")
    conf.set("spark.executor.memory", "12g")
    conf.set("spark.driver.allowMultipleContexts", "true")
    conf.set("spark.cores.max", "16")
    conf.set("spark.dynamicAllocation.enabled", "false")
    conf.set("hive.metastore.uris", "thrift://172.16.9.241:9083")
    // conf.set("hive.metastore.uris", "thrift://192.168.249.212:9083")
    conf.set("spark.sql.warehouse.dir", "hdfs://172.16.9.241:8020/user/hive/warehouse/ods.db")
    // conf.set("spark.sql.warehouse.dir", "hdfs://192.168.249.212:8020/user/hive/warehouse/ods.db")

    // Spark SQL session (Hive-enabled so the warehouse/metastore settings apply).
    val spark = SparkSession
      .builder()
      .config(conf)
      .enableHiveSupport()
      .getOrCreate()

    // Streaming context with a 5-second micro-batch interval.
    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))
    ssc.sparkContext.setLogLevel("ERROR")

    // val bootstrapServers = "172.16.9.240:9092,172.16.9.236:9092,172.16.9.237:9092"
    val bootstrapServers = "192.168.249.177:9092,192.168.249.128:9092,192.168.249.144:9092"
    val groupId = "test-20"
    val topicName = "ZEUS_MIS"
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> bootstrapServers,
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupId,
      // "latest" resets to the newest offset when no committed offset exists
      // (use "earliest" to replay the full topic instead).
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val kafkaTopicDS = KafkaUtils.createDirectStream(
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)
    )

    // Raw JSON message values.
    val lineDStream = kafkaTopicDS.map(_.value())
    lineDStream.print()

    import spark.implicits._

    // foreachRDD returns Unit, so its result is deliberately not bound.
    lineDStream.foreachRDD { rdd =>
      rdd
        .flatMap { line =>
          val json = JSON.parseObject(line)
          val header = json.getJSONObject("header")

          // Header fields shared by every row produced from this message.
          val traceId    = strOrNull(header, "traceId")
          val appName    = strOrNull(header, "appName")
          val deviceType = strOrNull(header, "deviceType")
          val version    = strOrNull(header, "version")
          val ip         = strOrNull(header, "ip")
          val sysTime    = strOrNull(header, "sysTime")
          val userAgent  = strOrNull(header, "userAgent")

          val body = json.getJSONArray("body")
          // One output row per body element (the JSON object at index i).
          (0 until body.size()).map { i =>
            val obj = body.getJSONObject(i)
            (
              traceId, appName, deviceType, version, ip, sysTime, userAgent,
              strOrNull(obj, "txnType"),
              strOrNull(obj, "txnCode"),
              strOrNull(obj, "txnSubCode"),
              strOrNull(obj, "refCode"),
              strOrNull(obj, "txnValue"),
              strOrNull(obj, "event"),
              // BUG FIX: the original read "event_track", but the payload
              // (see sample above) names this field "subEvent", so the
              // column was always "null".
              strOrNull(obj, "subEvent"),
              strOrNull(obj, "userId"),
              strOrNull(obj, "extend")
            )
          }
        }
        .toDF(
          "traceId", "appName", "deviceType", "version", "ip", "sysTime", "userAgent",
          "txnType", "txnCode", "txnSubCode", "refCode", "txnValue", "event",
          "subEvent", "userId", "extend"
        )
        .show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}