scala> val fileRDD=sc.textFile(“hdfs://192.168.116.60:9000/kb09file/op.log”)
scala> val jsonStrRDD=fileRDD.map(x=>x.split(’|’)).map(x=>(x(0),x(1)))
scala> val jsonRDD=jsonStrRDD.map(x=>{var jsonStr=x._2;jsonStr
= jsonStr.substring(0,jsonStr.length-1);jsonStr+",“id”:""+x._1+""}"}) //加双引号才是json字符串 很重要
scala> val jsonDF=jsonRDD.toDF
scala> jsonDF.printSchema
root
|-- value: string (nullable = true)
scala> import spark.implicits._
scala> import org.apache.spark.sql.functions._
scala> import org.apache.spark.sql._
scala> import org.apache.spark.sql.types._
scala> jsonDF.select(get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".cm").alias(“cm”)).printSchema
root
|-- cm: string (nullable = true)
scala> jsonDF.select(get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".cm").alias(“cm”)).show
±-------------------+
| cm|
±-------------------+
|{“ln”:"-55.0",“sv…|
|{“ln”:”-114.9","s…|
±-------------------+
scala> jsonDF.select(get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".cm").alias(“cm”),get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".ap").alias(“ap”),
get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".et").alias(“et”),get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".id").alias(“id”)).show()
±-------------------±–±-------------------±------------+
| cm| ap| et| id|
±-------------------±–±-------------------±------------+
|{“ln”:"-55.0",“sv…|app|[{“ett”:“15930500…|1593136280858|
|{“ln”:”-114.9”,"s…|app|[{“ett”:"15930632…|1593136280858|
±-------------------±–±-------------------±------------+
//将json字符串结构化
//表头 cm ap et id
//列 a1 b1 c1 d1
scala> val jsonDF2 = jsonDF.select(get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".cm").alias(“cm”),get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".ap").alias(“ap”)
,get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".et").alias(“et”),get_json_object(
"
v
a
l
u
e
"
,
"
"value","
"value",".id").alias(“id”))
scala> jsonDF2.printSchema
root
|-- cm: string (nullable = true)
|-- ap: string (nullable = true)
|-- et: string (nullable = true)
|-- id: string (nullable = true)
scala> jsonDF2.select(
"
i
d
"
,
"id",
"id",“ap”,
"
c
m
"
,
"cm",
"cm",“et”).show
±------------±–±-------------------±-------------------+
| id| ap| cm| et|
±------------±–±-------------------±-------------------+
|1593136280858|app|{“ln”:"-55.0",“sv…|[{“ett”:“15930500…|
|1593136280858|app|{“ln”:”-114.9”,"s…|[{“ett”:"15930632…|
±------------±–±-------------------±-------------------+
scala> val jsonDF3 = jsonDF2.select(
"
i
d
"
,
"id",
"id",“ap”,get_json_object(
"
c
m
"
,
"
"cm","
"cm",".ln") alias(“ln”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".sv") alias(“sv”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".os") alias(“os”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".g") alias(“g”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".mid") alias(“mid”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".nw") alias(“nw”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".l") alias(“l”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".vc") alias(“vc”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".hw") alias(“hw”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".ar") alias(“ar”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".uid") alias(“uid”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".t") alias(“t”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".la") alias(“la”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".md") alias(“md”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".vn") alias(“vn”),
get_json_object(
"
c
m
"
,
"
"cm","
"cm",".ba") alias(“ba”),get_json_object(
"
c
m
"
,
"
"cm","
"cm",".sr") alias(“sr”),$“et”)
scala> jsonDF3.printSchema
root
|-- id: string (nullable = true)
|-- ap: string (nullable = true)
|-- ln: string (nullable = true)
|-- sv: string (nullable = true)
|-- os: string (nullable = true)
|-- g: string (nullable = true)
|-- mid: string (nullable = true)
|-- nw: string (nullable = true)
|-- l: string (nullable = true)
|-- vc: string (nullable = true)
|-- hw: string (nullable = true)
|-- ar: string (nullable = true)
|-- uid: string (nullable = true)
|-- t: string (nullable = true)
|-- la: string (nullable = true)
|-- md: string (nullable = true)
|-- vn: string (nullable = true)
|-- ba: string (nullable = true)
|-- sr: string (nullable = true)
|-- et: string (nullable = true)
scala> jsonDF3.show
±------------±–±-----±-----±----±-----------------±–±--±–±--±-------±–±--±------------±----±---------±----±------±–±-------------------+
| id| ap| ln| sv| os| g|mid| nw| l| vc| hw| ar|uid| t| la| md| vn| ba| sr| et|
±------------±–±-----±-----±----±-----------------±–±--±–±--±-------±–±--±------------±----±---------±----±------±–±-------------------+
|1593136280858|app| -55.0|V2.9.6|8.0.4|C6816QZ0@gmail.com|489| 3G| es| 4| 640960| MX|489|1593123253541| 5.2|sumsung-18|1.3.4|Sumsung| I|[{“ett”:"15930500…|
|1593136280858|app|-114.9|V2.7.8|8.0.4|NW0S962J@gmail.com|490| 3G| pt| 8|6401136| MX|490|1593121224789|-44.4| Huawei-8|1.0.1| Huawei| O|[{“ett”:"15930632…|
±------------±–±-----±-----±----±-----------------±–±--±–±--±-------±–±--±------------±----±---------±----±------±–±-------------------+
//from_json把字符串
[
{“ett”:“a1”,“en”:“a2”,“kv”:“a3”},
{“ett”:“b1”,“en”:“b2”,“kv”:“b3”},
{“ett”:“c1”,“en”:“c2”,“kv”:“c3”}
] 结构化
ett en kv
a1 a2 a3
b1 b2 b3
c1 c2 c3
val jsonDF4 = jsonDF3.select( " i d " , "id", "id",“ap”, " l n " , "ln", "ln",“sv”, " o s " , "os", "os",“g”, " m i d " , "mid", "mid",“nw”, " l " , "l", "l",“vc”, " h w " , "hw", "hw",“ar”, " u i d " , "uid", "uid",“t”, " l a " , "la", "la",“md”, " v n " , "vn", "vn",“ba”, " s r " , f r o m j s o n ( "sr", from_json( "sr",fromjson(“et”,ArrayType(StructType(StructField(“ett”,StringType)::StructField(“en”,StringType)::StructField(“kv”,StringType)::Nil))).alias(“event”))
jsonDF4大脑想到的结构是这样:
id ap mid nw *** event
1593136280858 app 12345 dsdwe **** [[ett en ev],[b1 b2 b3 ],[c1 c2 c3]]
---------------------思考结束---------------------------
开干。。。。。。。。
第二步
scala> val jsonDF5 = jsonDF4.select( " i d " , "id", "id",“ap”, " l n " , "ln", "ln",“sv”, " o s " , "os", "os",“g”, " m i d " , "mid", "mid",“nw”, " l " , "l", "l",“vc”, " h w " , "hw", "hw",“ar”, " u i d " , "uid", "uid",“t”, " l a " , "la", "la",“md”, " v n " , "vn", "vn",“ba”, " s r " , e x p l o d e ( "sr", explode( "sr",explode(“event”).alias(“event”))