Spark对数据日志进行清洗

最新推荐文章于 2022-04-14 14:30:43 发布

jokertiger

最新推荐文章于 2022-04-14 14:30:43 发布

阅读量686

点赞数

本文链接：https://blog.csdn.net/jokertiger/article/details/116988129

版权

样例数据(取一行）

2018-09-04T20:27:31+08:00	http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+MetaSr+1.0&actionEnd=1536150451668&actionName=startEval&actionTest=0&actionType=3&actionValue=272090&clientType=001_kgc&examType=001&ifEquipment=web&isFromContinue=false&skillIdCount=0&skillLevel=0&testType=jineng&userSID=B842B843AE317425D53D0C567A903EF7.exam-tomcat-node3.exam-tomcat-node3&userUID=272090&userUIP=1.180.18.157	GET	200	192.168.168.64	-	-	Apache-HttpClient/4.1.2 (java 1.5)

具体实现如下

package kb11.ed

import java.util.Properties

import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
object EtlDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    import spark.implicits._
   // 加载日志数据，按照\t分割，过滤出长度为8的数据，将数据封装到Row中
 val rowRdd: RDD[Row] = sc.textFile("inpath/test.log").map(x => x.split("\t"))
      .filter(_.length == 8).map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
      
      val long_schame: StructType = StructType(Array(
      StructField("event_time", StringType),
      StructField("url", StringType),
      StructField("method", StringType),
      StructField("status", StringType),
      StructField("sip", StringType),
      StructField("user_uip", StringType),
      StructField("action_prepend", StringType),
      StructField("action_client", StringType)
    ))
    val logDF: DataFrame = spark.createDataFrame(rowRdd,long_schame)
    
//去除重复的event_time,url列，过滤出状态为200的数据，除去event_time为空的数据
	val fiterLogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(x => x(3) == "200")
      .filter(x => StringUtils.isNotEmpty(x(0).toString))
val full_logs_RDD: RDD[Row] = fiterLogs.map(line => {
      val str = line.getAs[String]("url")
      val paramsArray= str.split("\\?")
      var paramsMap: Map[String, String] = null
      if (paramsArray.length == 2) {
        val strings = paramsArray(1).split("&")
        paramsMap = strings.map(x => x.split("=")).filter(x => x.length == 2)
          .map(x => (x(0), x(1))).toMap
      }
      ( //最好不要超过22个，如果非要那么多，建议使用元组中套元组
        line.getAs[String]("event_time"),
        paramsMap.getOrElse[String]("userUID", ""),
        paramsMap.getOrElse[String]("userSID", ""),
        paramsMap.getOrElse[String]("actionBegin", ""),
        paramsMap.getOrElse[String]("actionEnd", ""),
        paramsMap.getOrElse[String]("actionType", ""),
        paramsMap.getOrElse[String]("actionName", ""),
        paramsMap.getOrElse[String]("actionValue", ""),
        paramsMap.getOrElse[String]("actionTest", ""),
        paramsMap.getOrElse[String]("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")
      )
    }).toDF().rdd
    
	val full_logs_schema = StructType(
      Array(
        StructField("event_time",StringType),

        StructField("userUID",StringType),
        StructField("userSID",StringType),
        StructField("actionBegin",StringType),
        StructField("actionEnd",StringType),
        StructField("actionType",StringType),
        StructField("actionName",StringType),
        StructField("actionValue",StringType),
        StructField("actionTest",StringType),
        StructField("ifEquipment",StringType),

        StructField("method",StringType),
        StructField("status",StringType),
        StructField("sip",StringType),
        StructField("user_uip",StringType),
        StructField("action_prepend",StringType),
        StructField("action_client",StringType)
      )
    )
    val full_logDF = spark.createDataFrame(full_logs_RDD,full_logs_schema)
    full_logDF.printSchema()
    full_logDF.show(10)
    //连接Mysql，将数据存入其中
    val url = "jdbc:mysql://192.168.119.125:3306/etl"
    val user = "root"
    val password = "ok"
    val driver = "com.mysql.jdbc.Driver"

    val properties = new Properties()
    properties.setProperty("url",url)
    properties.setProperty("user",user)
    properties.setProperty("password",password)
    properties.setProperty("driver",driver)
    //写入Mysql
    println("将filterlogs写入Mysql")

    fiterLogs.write.mode("Overwrite").jdbc(url,"access_logs",properties)

    println("写入完成")

    println("写入full_logDF到数据库")
    full_logDF.write.mode(SaveMode.Append).jdbc(jdbcUntils.url,jdbcUntils.table_full_access_log,properties)
    println("写入full_logDF到数据库完成")
    spark.stop()

jokertiger

关注

0
点赞
踩
2

收藏

觉得还不错? 一键收藏
0
评论
Spark对数据日志进行清洗

样例数据(取一行）2018-09-04T20:27:31+08:00 http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+Met
复制链接

扫一扫