使用spark分析日志文件
准备阶段
启动好spark-shell的虚拟机
一份op.log文件,如下图所示:
导包
import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._
整合格式
不难发现,一整条信息是用|
分隔的,我们来读取一下
val fileRDD=sc.textFile("hdfs://HadoopY:9000/kb09File/op.log")
val jsonStringRDD=fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))
这里可以看出来在大括号前有一串数字,是该条信息的ID,我们需要将它合并到大括号里
val jsonRDD=jsonStringRDD.map(x=>{var jsonStr=x._2; jsonStr=jsonStr.substring(0,jsonStr.length-1); jsonStr=jsonStr+",\"id\":\""+x._1+"\"}";jsonStr})
转入DataFrame
val jsonDF=jsonRDD.toDF
分解第一层json列
val jsonDF2 = jsonDF.select(get_json_object($"value","$.cm").as("cm"),
get_json_object($"value","$.ap").as("ap"),
get_json_object($"value","$.et").as("et"),
get_json_object($"value","$.id").as("id"))
分解cm列
val jsonDF3=jsonDF2.select($"id",$"ap",get_json_object($"cm","$.ln").alias("ln"),get_json_object($"cm","$.sv").alias("sv"),get_json_object($"cm","$.os").alias("os"),get_json_object($"cm","$.g").alias("g"),get_json_object($"cm","$.mid").alias("mid"),get_json_object($"cm","$.nw").alias("nw"),get_json_object($"cm","$.l").alias("l"),get_json_object($"cm","$.vc").alias("vc"),get_json_object($"cm","$.hw").alias("hw"),get_json_object($"cm","$.ar").alias("ar"),get_json_object($"cm","$.uid").alias("uid"),get_json_object($"cm","$.t").alias("t"),get_json_object($"cm","$.la").alias("la"),get_json_object($"cm","$.md").alias("md"),get_json_object($"cm","$.vn").alias("vn"),get_json_object($"cm","$.ba").alias("ba"),get_json_object($"cm","$.sr").alias("sr"),$"et")
分解et列
val jsonDF4=jsonDF3.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("event"))
使用explode并取别名
val jsonDF5 =jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event").alias("event"))
注意这里和jsonDF4的区别
分解kv列
kv列下有很多种类的en
列,对应后面的列也不同,所以我们需要使用filter来分类讨论
loading
val jsonLoading=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="loading")
val jsonLoadingArray=jsonLoading.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("extend2",StringType)::StructField("loading_time",StringType)::StructField("action",StringType)::StructField("extend1",StringType)::StructField("type",StringType)::StructField("loading_way",StringType)::Nil))).alias("kv"))
val jsonLoadingExplode=jsonLoadingArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonLoadingFinal=jsonLoadingExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.extend2",$"kv.loading_time",$"kv.action",$"kv.extend1",$"kv.type",$"kv.loading_way")
ad
val jsonAd=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="ad")
val jsonAdArray=jsonAd.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("activityId",StringType)::StructField("displayMills",StringType)::StructField("entry",StringType)::StructField("action",StringType)::StructField("contentType",StringType)::Nil))).alias("kv"))
val jsonAdExplode=jsonAdArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonAdFinal=jsonAdExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.activityId",$"kv.displayMills",$"kv.entry",$"kv.action",$"kv.contentType")
notification
val jsonNoti=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="notification")
val jsonNotiArray=jsonNoti.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("ap_time",StringType)::StructField("action",StringType)::StructField("type",StringType)::StructField("content",StringType)::Nil))).alias("kv"))
val jsonNotiExplode=jsonNotiArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonNotiFinal=jsonNotiExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.ap_time",$"kv.action",$"kv.type",$"kv.content")
active_background
val jsonAB=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="active_background")
val jsonABArray=jsonAB.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("active_source",StringType)::Nil))).alias("kv"))
val jsonABExplode=jsonABArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonABFinal=jsonABExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.active_source")
comment
val jsonComment=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="comment")
val jsonCommentArray=jsonComment.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("p_comment_id",StringType)::StructField("addtime",StringType)::StructField("praise_count",StringType)::StructField("other_id",StringType)::StructField("comment_id",StringType)::StructField("reply_count",StringType)::StructField("userid",StringType)::StructField("content",StringType)::Nil))).alias("kv"))
val jsonCommentExplode=jsonCommentArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonCommentFinal=jsonCommentExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.p_comment_id",$"kv.addtime",$"kv.praise_count",$"kv.other_id",$"kv.comment_id",$"kv.reply_count",$"kv.userid",$"kv.content")
praise
val jsonPraise=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="praise")
val jsonPraiseArray=jsonPraise.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("target_id",StringType)::StructField("id",StringType)::StructField("type",StringType)::StructField("add_time",StringType)::StructField("userid",StringType)::Nil))).alias("kv"))
val jsonPraiseExplode=jsonPraiseArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))
val jsonPraiseFinal=jsonPraiseExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.target_id",$"kv.id",$"kv.type",$"kv.add_time",$"kv.userid")