Spark_shell数据清洗案例

Spark_shell数据清洗案例

本文的目的是模拟公司的实际数据清洗
数据来源是我自己用java代码模拟的,模拟代码及数据文件会上传至csdn,可以自由下载
本文的数据格式如下(en事件名称有七种类型)

在这里插入图片描述

在这里插入图片描述
在这里插入图片描述
在这里插入图片描述
在这里插入图片描述

将本地文件上传至sparkRDD

scala> val optionRDD=sc.textFile("/opt/tmp/logs/op.log")
optionRDD: org.apache.spark.rdd.RDD[String] = /opt/tmp/logs/op.log MapPartitionsRDD[1] at textFile at <console>:24

计数

scala> optionRDD.count
res0: Long = 1000

创建头

scala> val schemaString = "time content"
schemaString: String = time content

//切割成两个字段
scala> val fields = schemaString.split(" ")
fields: Array[String] = Array(time, content)

scala> fields.foreach(println)
time
content

导包

scala> import org.apache.spark.sql.types._

将time content 通过map改变结构变成数组

scala>  val fields = schemaString.split(" ").map(fieldName=>StructField(fieldName,StringType,nullable=true))
fields: Array[org.apache.spark.sql.types.StructField] = Array(StructField(time,StringType,true), StructField(content,StringType,true))


scala> fields.foreach(println)
StructField(time,StringType,true)
StructField(content,StringType,true)`

将time content结构化

scala> val schema = StructType(fields)
schema: org.apache.spark.sql.types.StructType = StructType(StructField(time,StringType,true), StructField(content,StringType,true))

分析从本地上传的文件

scala> val rowRDD = optionRDD.map(_.split("\\|"))
rowRDD: org.apache.spark.rdd.RDD[Array[String]] = MapPartitionsRDD[3] at map at <console>:31

scala> rowRDD.foreach(x=>println(x(0)))

导包

scala> import org.apache.spark.sql._
import org.apache.spark.sql._

将文件内容分成两部分,以数组的形式

scala> val rowRDD = optionRDD.map(_.split("\\|")).map(attributes=>Row(attributes(0).trim(),attributes(1).trim()))
rowRDD: org.apache.spark.rdd.RDD[org.apache.spark.sql.Row] = MapPartitionsRDD[5] at map at <console>:34

创建DF

scala> val opDF = spark.createDataFrame(rowRDD,schema)
opDF: org.apache.spark.sql.DataFrame = [time: string, content: string]

scala> opDF.printSchema
root
 |-- time: string (nullable = true)
 |-- content: string (nullable = true)

scala> opDF.show(1,false)

在这里插入图片描述

获取第一层content

scala> val opDF1 = opDF.select($"time",get_json_object($"content","$.cm").alias("cm"),get_json_object($"content","$.ap").alias("ap"),get_json_object($"content","$.et").alias("et"))
opDF1: org.apache.spark.sql.DataFrame = [time: string, cm: string ... 2 more fields]

scala> opDF1.printSchema
root
 |-- time: string (nullable = true)
 |-- cm: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- et: string (nullable = true)

scala> opDF1.show(1,false)
|time|cm|ap |et 
|1593136280643|{"ln":"-87.7","sv":"V2.1.8","os":"8.0.3","g":"W51WU5I5@gmail.com","mid":"0","nw":"3G","l":"pt","vc":"16","hw":"640*1136","ar":"MX","uid":"0","t":"1593072938707","la":"29.6","md":"Huawei-8","vn":"1.2.3","ba":"Huawei","sr":"I"}|app|[{"ett":"1593051532834","en":"display","kv":{"goodsid":"0","action":"1","extend1":"1","place":"1","category":"71"}}]

获取第二层

scala> val opDF2 = opDF1.select($"time",$"ap",get_json_object($"cm","$.ln").alias("ln"),get_json_object($"cm","$.sv").alias("sv"),get_json_object($"cm","$.os").alias("os"),get_json_object($"cm","$.g").alias("g"),get_json_object($"cm","$.mid").alias("mid"),get_json_object($"cm","$.nw").alias("nw"),get_json_object($"cm","$.l").alias("l"),get_json_object($"cm","$.vc").alias("vc"),get_json_object($"cm","$.hw").alias("hw"),get_json_object($"cm","$.ar").alias("ar"),get_json_object($"cm","$.uid").alias("uid"),get_json_object($"cm","$.t").alias("t"),get_json_object($"cm","$.la").alias("la"),get_json_object($"cm","$.md").alias("md"),get_json_object($"cm","$.vn").alias("vn"),get_json_object($"cm","$.ba").alias("ba"),get_json_object($"cm","$.sr").alias("sr"),from_json($"et",ArrayType(StructType(StructField("ett",StringType)::StructField("en",StringType)::StructField("kv",StringType)::Nil))).as("events"))
opDF2: org.apache.spark.sql.DataFrame = [time: string, ap: string ... 18 more fields]

scala> opDF2.printSchema
root
 |-- time: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- events: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- ett: string (nullable = true)
 |    |    |-- en: string (nullable = true)
 |    |    |-- kv: string (nullable = true)

将event进行explode

scala> val opDF3 = opDF2.select($"time",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",explode($"events").as("eventcontent"))
opDF3: org.apache.spark.sql.DataFrame = [time: string, ap: string ... 18 more fields]


scala> opDF3.printSchema
root
 |-- time: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- eventcontent: struct (nullable = true)
 |    |-- ett: string (nullable = true)
 |    |-- en: string (nullable = true)
 |    |-- kv: string (nullable = true)

scala> opDF3.select($"time",$"sr",$"eventcontent").where("time=1593137251934").show(10,false)
+-------------+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|time         |sr |eventcontent                                                                                                                                                                                                                                                                                                                                       |
+-------------+---+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|1593137251934|R  |[1593041825601, display, {"goodsid":"0","action":"1","extend1":"2","place":"4","category":"28"}]                                                                                                                                                                                                                                                   |
|1593137251934|R  |[1593074010012, notification, {"ap_time":"1593039359797","action":"4","type":"1","content":""}]                                                                                                                                                                                                                                                    |
|1593137251934|R  |[1593110124137, error, {"errorDetail":"at cn.lift.dfdfdf.control.CommandUtil.getInfo(CommandUtil.java:67)\\n at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\\n at java.lang.reflect.Method.invoke(Method.java:606)\\n","errorBrief":"at cn.lift.appIn.control.CommandUtil.getInfo(CommandUtil.java:67)"}]|
|1593137251934|R  |[1593042153320, favorites, {"course_id":2,"id":0,"add_time":"1593047184972","userid":8}]                                                                                                                                                                                                                                                           |
|1593137251934|R  |[1593068864495, praise, {"target_id":7,"id":3,"type":3,"add_time":"1593101957239","userid":3}] 

获取第三层,即event

scala> val opDF4 = opDF3.select($"time",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"eventcontent.ett",$"eventcontent.en",$"eventcontent.kv")
opDF4: org.apache.spark.sql.DataFrame = [time: string, ap: string ... 20 more fields]


scala> opDF4.printSchema
root
 |-- time: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- ett: string (nullable = true)
 |-- en: string (nullable = true)
 |-- kv: string (nullable = true)


scala> opDF4.select($"time",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en").where("en='display'").show(10,false)
+-------------+---+------+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+
|time         |ap |ln    |sv    |os   |g                 |mid|nw  |l  |vc |hw       |ar |uid|t            |la   |md        |vn   |ba     |sr |ett          |en     |
+-------------+---+------+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+
|1593137251934|app|-48.8 |V2.1.2|8.1.7|9T5WG2IF@gmail.com|0  |WIFI|en |4  |750*1134 |MX |0  |1593054455815|-13.8|Huawei-0  |1.0.5|Huawei |R  |1593041825601|display|
|1593137251952|app|-94.9 |V2.7.0|8.2.7|2S3H1L7K@gmail.com|4  |4G  |en |6  |750*1134 |MX |4  |1593119037940|-46.8|sumsung-1 |1.0.6|Sumsung|I  |1593058474741|display|
|1593137251954|app|-69.0 |V2.4.8|8.0.3|85B866BI@gmail.com|8  |3G  |es |2  |640*960  |MX |8  |1593083277982|22.4 |sumsung-13|1.0.3|Sumsung|S  |1593053303645|display|
|1593137251958|app|-97.9 |V2.3.0|8.0.6|59V555IC@gmail.com|14 |3G  |es |5  |640*1136 |MX |14 |1593105762133|-45.6|Huawei-16 |1.1.5|Huawei |L  |1593098492648|display|
|1593137251962|app|-50.7 |V2.6.6|8.1.9|7023Q381@gmail.com|15 |WIFI|es |16 |1080*1920|MX |15 |1593096659201|-47.8|sumsung-11|1.2.7|Sumsung|G  |1593041348870|display|
|1593137251963|app|-77.1 |V2.9.8|8.2.1|W45Y7IZ9@gmail.com|17 |4G  |pt |8  |1080*1920|MX |17 |1593127628700|2.8  |sumsung-10|1.3.0|Sumsung|K  |1593070583966|display|
|1593137251964|app|-53.1 |V2.1.3|8.2.3|44768OU6@gmail.com|18 |3G  |en |9  |640*1136 |MX |18 |1593114312736|-11.1|HTC-1     |1.3.9|HTC    |Z  |1593110468000|display|
|1593137251966|app|-88.1 |V2.2.6|8.1.5|A4QF5V3G@gmail.com|27 |WIFI|pt |1  |750*1134 |MX |27 |1593049555889|18.8 |sumsung-8 |1.0.6|Sumsung|N  |1593083023112|display|
|1593137251967|app|-104.9|V2.7.1|8.2.2|1L1PF21L@gmail.com|28 |WIFI|pt |13 |750*1134 |MX |28 |1593123010449|9.8  |Huawei-13 |1.3.4|Huawei |F  |1593072640137|display|
|1593137251968|app|-105.1|V2.7.1|8.0.4|P5OB863W@gmail.com|29 |4G  |es |13 |640*960  |MX |29 |1593132148433|-20.0|Huawei-9  |1.0.9|Huawei |C  |1593052346863|display|
+-------------+---+------+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+----------+-----+-------+---+-------------+-------+

获取最后一层et.en不同值时et.kv
由于每个kv的值不一样,所以根据kv.en的值不同进行分类
由最开始的数据结构图可知,有七种类型,在此给出两种案列

当en=display时,DF如下:

scala> val opDisplayDF =  opDF4.select($"time",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en",get_json_object($"kv","$.goodsid").alias("goodsid"),get_json_object($"kv","$.action").alias("action"),get_json_object($"kv","$.extend1").alias("extend1"),get_json_object($"kv","$.place").alias("place"),get_json_object($"kv","$.category").alias("category")).where("en='display'")
opDisplayDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [time: string, ap: string ... 24 more fields]


scala> opDisplayDF.printSchema
root
 |-- time: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- ett: string (nullable = true)
 |-- en: string (nullable = true)
 |-- goodsid: string (nullable = true)
 |-- action: string (nullable = true)
 |-- extend1: string (nullable = true)
 |-- place: string (nullable = true)
 |-- category: string (nullable = true)


scala> opDisplayDF.show(2,false)
2020-06-29 22:55:00 WARN  Utils:66 - Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.debug.maxToStringFields' in SparkEnv.conf.
+-------------+---+-----+------+-----+------------------+---+----+---+---+--------+---+---+-------------+-----+---------+-----+-------+---+-------------+-------+-------+------+-------+-----+--------+
|time         |ap |ln   |sv    |os   |g                 |mid|nw  |l  |vc |hw      |ar |uid|t            |la   |md       |vn   |ba     |sr |ett          |en     |goodsid|action|extend1|place|category|
+-------------+---+-----+------+-----+------------------+---+----+---+---+--------+---+---+-------------+-----+---------+-----+-------+---+-------------+-------+-------+------+-------+-----+--------+
|1593137251934|app|-48.8|V2.1.2|8.1.7|9T5WG2IF@gmail.com|0  |WIFI|en |4  |750*1134|MX |0  |1593054455815|-13.8|Huawei-0 |1.0.5|Huawei |R  |1593041825601|display|0      |1     |2      |4    |28      |
|1593137251952|app|-94.9|V2.7.0|8.2.7|2S3H1L7K@gmail.com|4  |4G  |en |6  |750*1134|MX |4  |1593119037940|-46.8|sumsung-1|1.0.6|Sumsung|I  |1593058474741|display|1      |2     |1      |1    |44      |
+-------------+---+-----+------+-----+------------------+---+----+---+---+--------+---+---+-------------+-----+---------+-----+-------+---+-------------+-------+-------+------+-------+-----+--------+

当en=active_background时,DF如下:

scala> val opActionBackgroundDF =  opDF4.select($"time",$"ap",$"ln",$"sv",$"os",$"g",$"mid",$"nw",$"l",$"vc",$"hw",$"ar",$"uid",$"t",$"la",$"md",$"vn",$"ba",$"sr",$"ett",$"en", get_json_object($"kv","$.active_source").alias("activeSource")).where("en='active_background'")
opActionBackgroundDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [time: string, ap: string ... 20 more fields]

scala> opActionBackgroundDF.printSchema
root
 |-- time: string (nullable = true)
 |-- ap: string (nullable = true)
 |-- ln: string (nullable = true)
 |-- sv: string (nullable = true)
 |-- os: string (nullable = true)
 |-- g: string (nullable = true)
 |-- mid: string (nullable = true)
 |-- nw: string (nullable = true)
 |-- l: string (nullable = true)
 |-- vc: string (nullable = true)
 |-- hw: string (nullable = true)
 |-- ar: string (nullable = true)
 |-- uid: string (nullable = true)
 |-- t: string (nullable = true)
 |-- la: string (nullable = true)
 |-- md: string (nullable = true)
 |-- vn: string (nullable = true)
 |-- ba: string (nullable = true)
 |-- sr: string (nullable = true)
 |-- ett: string (nullable = true)
 |-- en: string (nullable = true)
 |-- activeSource: string (nullable = true)


scala> opActionBackgroundDF.show(1,false)
+-------------+---+-----+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+--------+-----+------+---+-------------+-----------------+------------+
|time         |ap |ln   |sv    |os   |g                 |mid|nw  |l  |vc |hw       |ar |uid|t            |la   |md      |vn   |ba    |sr |ett          |en               |activeSource|
+-------------+---+-----+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+--------+-----+------+---+-------------+-----------------+------------+
|1593137251948|app|-37.9|V2.7.1|8.0.0|4QALOI53@gmail.com|3  |WIFI|en |18 |1080*1920|MX |3  |1593084695545|-36.1|Huawei-2|1.1.0|Huawei|P  |1593084261117|active_background|2           |
+-------------+---+-----+------+-----+------------------+---+----+---+---+---------+---+---+-------------+-----+--------+-----+------+---+-------------+-----------------+------------+

  • 2
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值