目录
点击下载样例文件
提取码: vjex
一、Json文件日志
使用数据:op.log 文件
注:以下操作均在 spark-shell 交互界面中执行
1.1 清洗数据
数据样例展示(清洗后的 JSON 包含 cm、ap、et、id 四个字段):
"cm":
{
"ln":"-55.0",
"sv":"V2.9.6",
"os":"8.0.4",
"g":"C6816QZ0@gmail.com",
"mid":"489",
"nw":"3G",
"l":"es",
"vc":"4",
"hw":"640*960",
"ar":"MX",
"uid":"489",
"t":"1593123253541",
"la":"5.2",
"md":"sumsung-18",
"vn":"1.3.4",
"ba":"Sumsung",
"sr":"I"},
"ap":"app",
"et":[
{
"ett":"1593050051366","en":"loading","kv":{
"extend2":"","loading_time":"14","action":"3","extend1":"","type":"2","type1":"201","loading_way":"1"}},
{
"ett":"1593108791764","en":"ad","kv":{
"activityId":"1","displayMills":"78522","entry":"1","action":"1","contentType":"0"}},{
"ett":"1593111271266","en":"notification","kv":{
"ap_time":"1593097087883","action":"1","type":"1","content":""}},{
"ett":"1593066033562","en":"active_background","kv":{
"active_source":"3"}},
{
"ett":"1593135644347","en":"comment","kv":{
"p_comment_id":1,"addtime":"1593097573725","praise_count":973,"other_id":5,"comment_id":9,"reply_count":40,"userid":7,"content":"辑赤蹲慰鸽抿肘捎"}}],
"id":"1593136280858"
//日志上传hdfs
hdfs dfs -put /opt/op.log /logFile/
//开启 spark-shell(以下 Scala 代码均在 spark-shell 中执行)
// Read the raw log file from HDFS; each record is expected to look like `<id>|<json>`.
val lines = sc.textFile("hdfs://hadoop001:9000/logFile/op.log")
// Split each record into (id, json). The limit of 2 splits on the first '|' only,
// so a '|' inside the JSON payload no longer truncates it; records without any '|'
// are skipped instead of failing the job with an ArrayIndexOutOfBoundsException.
val rdd = lines.map(_.split("\\|", 2)).collect { case Array(id, json) => (id, json) }
// Append the id to the JSON object as a trailing "id" field.
val jsonRdd = rdd.map { case (id, json) =>
  // Drop the closing brace (tolerating trailing whitespace), then re-close the
  // object after the new field.
  val body = json.trim.stripSuffix("}")
  s"""$body,"id":"$id"}"""
}
1.2 RDD 转 DataFrame 格式
//导包
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions._
// Convert the RDD of JSON strings into a DataFrame; the code below reads its
// single column by the name "value".
val jsonDF = jsonRdd.toDF
// Pull the four top-level fields {"cm", "ap", "et", "id"} out of the JSON string,
// producing one column per field, each named after its key.
val df = jsonDF.select(
  Seq("cm", "ap", "et", "id").map(key => get_json_object(col("value"), s"$$.$key").as(key)): _*
)
1.3 提取 'cm' 中的信息
val df2 = df.select($"id",$"ap",
get_json_object($"cm","$.ln").as("ln"),
get_json_object($"cm","$.sv").as("sv"),
get_json_object($"cm","$.os").as("os"),
get_json_object($"cm","$.g").as("g"),
get_json_object($"cm","$.mid").as("mid"),
get_json_object($"cm","$.l").as("l"),
get_json_object($"cm","$.vc").as("vc"),
get_json_object($"cm","$.hw").as("hw"),
get_json_object($"cm","$.ar").as("ar"),
get_json_object($"cm","$.uid").as("uid"),
get_json_object($"cm","$.t").as("t"),
get_json_object