【Spark项目实战】使用spark分析日志文件(json格式)

14 篇文章 0 订阅
10 篇文章 1 订阅

准备阶段

启动好spark-shell的虚拟机
一份op.log文件,如下图所示:
在这里插入图片描述

导包

import spark.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql._

整合格式

不难发现,一整条信息是用|分隔的,我们来读取一下

val fileRDD=sc.textFile("hdfs://HadoopY:9000/kb09File/op.log")

在这里插入图片描述

val jsonStringRDD=fileRDD.map(x=>x.split('|')).map(x=>(x(0),x(1)))

在这里插入图片描述

这里可以看出来在大括号前有一串数字,是该条信息的ID,我们需要将它合并到大括号里

val jsonRDD=jsonStringRDD.map(x=>{var jsonStr=x._2; jsonStr=jsonStr.substring(0,jsonStr.length-1); jsonStr=jsonStr+",\"id\":\""+x._1+"\"}";jsonStr})

在这里插入图片描述

转入DataFrame

val jsonDF=jsonRDD.toDF

在这里插入图片描述

分解第一层json列

val jsonDF2 = jsonDF.select(get_json_object($"value","$.cm").as("cm"),
   get_json_object($"value","$.ap").as("ap"),
   get_json_object($"value","$.et").as("et"),
   get_json_object($"value","$.id").as("id"))

在这里插入图片描述

分解cm列

val jsonDF3=jsonDF2.select($"id",$"ap",get_json_object($"cm","$.ln").alias("ln"),get_json_object($"cm","$.sv").alias("sv"),get_json_object($"cm","$.os").alias("os"),get_json_object($"cm","$.g").alias("g"),get_json_object($"cm","$.mid").alias("mid"),get_json_object($"cm","$.nw").alias("nw"),get_json_object($"cm","$.l").alias("l"),get_json_object($"cm","$.vc").alias("vc"),get_json_object($"cm","$.hw").alias("hw"),get_json_object($"cm","$.ar").alias("ar"),get_json_object($"cm","$.uid").alias("uid"),get_json_object($"cm","$.t").alias("t"),get_json_object($"cm","$.la").alias("la"),get_json_object($"cm","$.md").alias("md"),get_json_object($"cm","$.vn").alias("vn"),get_json_object($"cm","$.ba").alias("ba"),get_json_object($"cm","$.sr").alias("sr"),$"et")

在这里插入图片描述

分解et列

val jsonDF4=jsonDF3.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",
from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).alias("event"))

在这里插入图片描述
使用explode并取别名

val jsonDF5 =jsonDF4.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"event").alias("event"))

在这里插入图片描述
注意这里和jsonDF4的区别

分解kv列

kv列下有很多种类的en列,对应后面的列也不同,所以我们需要使用filter来分类讨论

loading

val jsonLoading=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="loading")

val jsonLoadingArray=jsonLoading.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("extend2",StringType)::StructField("loading_time",StringType)::StructField("action",StringType)::StructField("extend1",StringType)::StructField("type",StringType)::StructField("loading_way",StringType)::Nil))).alias("kv"))

val jsonLoadingExplode=jsonLoadingArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonLoadingFinal=jsonLoadingExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.extend2",$"kv.loading_time",$"kv.action",$"kv.extend1",$"kv.type",$"kv.loading_way")

在这里插入图片描述

ad

val jsonAd=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="ad")

val jsonAdArray=jsonAd.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("activityId",StringType)::StructField("displayMills",StringType)::StructField("entry",StringType)::StructField("action",StringType)::StructField("contentType",StringType)::Nil))).alias("kv"))

val jsonAdExplode=jsonAdArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonAdFinal=jsonAdExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.activityId",$"kv.displayMills",$"kv.entry",$"kv.action",$"kv.contentType")

在这里插入图片描述

notification

val jsonNoti=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="notification")

val jsonNotiArray=jsonNoti.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("ap_time",StringType)::StructField("action",StringType)::StructField("type",StringType)::StructField("content",StringType)::Nil))).alias("kv"))

val jsonNotiExplode=jsonNotiArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonNotiFinal=jsonNotiExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.ap_time",$"kv.action",$"kv.type",$"kv.content")

在这里插入图片描述

active_background

val jsonAB=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="active_background")

val jsonABArray=jsonAB.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("active_source",StringType)::Nil))).alias("kv"))

val jsonABExplode=jsonABArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonABFinal=jsonABExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.active_source")

在这里插入图片描述

comment

val jsonComment=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="comment")

val jsonCommentArray=jsonComment.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("p_comment_id",StringType)::StructField("addtime",StringType)::StructField("praise_count",StringType)::StructField("other_id",StringType)::StructField("comment_id",StringType)::StructField("reply_count",StringType)::StructField("userid",StringType)::StructField("content",StringType)::Nil))).alias("kv"))

val jsonCommentExplode=jsonCommentArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonCommentFinal=jsonCommentExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.p_comment_id",$"kv.addtime",$"kv.praise_count",$"kv.other_id",$"kv.comment_id",$"kv.reply_count",$"kv.userid",$"kv.content")

在这里插入图片描述

praise

val jsonPraise=jsonDF5.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"event.ett",$"event.en",$"event.kv").filter($"en"==="praise")

val jsonPraiseArray=jsonPraise.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",from_json($"kv",ArrayType(StructType(StructField("target_id",StringType)::StructField("id",StringType)::StructField("type",StringType)::StructField("add_time",StringType)::StructField("userid",StringType)::Nil))).alias("kv"))

val jsonPraiseExplode=jsonPraiseArray.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",explode($"kv").alias("kv"))

val jsonPraiseFinal=jsonPraiseExplode.select($"id",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",$"kv.target_id",$"kv.id",$"kv.type",$"kv.add_time",$"kv.userid")

在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值