Spark Log Processing (Data Cleaning)

Source Data Preparation

Prepare a file in .log format; its contents are shown in the screenshot below.
(image: sample log data)

Data Cleaning Requirements

(image: data cleaning requirements)

Environment Setup

After reading the file, split the data step by step according to its format, and save the final result to a MySQL database. The code is shown below.

First, define a parameter object so the connection settings do not have to be repeated at every call site.

object JdbcUtils {
  val url="jdbc:mysql://192.168.146.222:3306/etldemo"
  val user="root"
  val driver="com.mysql.jdbc.Driver"
  val pwd="1"
  val table_access_logs = "access_logs"
  val table_full_access_logs="full_access_logs"
}
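
To run this, the Spark SQL and MySQL JDBC driver artifacts must be on the classpath. A minimal build.sbt sketch, assuming Spark 2.4.x on Scala 2.11 and the MySQL 5.x connector (adjust the versions to whatever your cluster actually uses):

// build.sbt (assumed versions -- adjust to your environment)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql" % "2.4.8" % "provided",
  "mysql" % "mysql-connector-java" % "5.1.49"
)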

Cleaning the Data

Write the cleaning code to carry out the requirements above.

import java.util.Properties
import nj.zb.kb11.etl.url.JdbcUtils
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._

object EtlDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("innerdemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    import spark.implicits._

    //Load the log data, split each line on \t, keep rows with at least 8 fields, and wrap each row in a Row object
    val rowRDD: RDD[Row] = sc.textFile("in/test.log").map(x => x.split("\t"))
      .filter(x => x.length >= 8)
      .map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
//    rowRDD.collect().foreach(println)

    //Create the schema
    val logs_schema = StructType(
      Array(
        StructField("event_time", StringType),
        StructField("url", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    )
   val logDF: DataFrame = spark.createDataFrame(rowRDD,logs_schema)
//    logDF.printSchema()
//    logDF.show(3)

    //Drop rows that duplicate (event_time, url), keep only rows with status 200, and drop rows whose event_time is empty
    val filterLogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(x => x(3) == "200")
      .filter(x => StringUtils.isNotEmpty(x(0).toString))

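    //Expand the query string in the url column into separate key/value fields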
    val full_logs_rdd: RDD[Row] = filterLogs.map(line => {
      val str: String = line.getAs[String]("url")
      val paramsArray: Array[String] = str.split("\\?")
      var paramsMap: Map[String, String] = Map.empty //default to an empty map so urls without a query string do not cause a NullPointerException
      if (paramsArray.length == 2) {
        paramsMap = paramsArray(1)
          .split("&")
          .map(x => x.split("="))
          .filter(x => x.length == 2)
          .map(x => (x(0), x(1))).toMap
      }
      (
        line.getAs[String]("event_time"),
        paramsMap.getOrElse[String]("userUID", ""),
        paramsMap.getOrElse[String]("userSID", ""),
        paramsMap.getOrElse[String]("actionBegin", ""),
        paramsMap.getOrElse[String]("actionEnd", ""),
        paramsMap.getOrElse[String]("actionType", ""),
        paramsMap.getOrElse[String]("actionName", ""),
        paramsMap.getOrElse[String]("actionValue", ""),
        paramsMap.getOrElse[String]("actionTest", ""),
        paramsMap.getOrElse[String]("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")

      )
    }).toDF().rdd
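    //Schema for the expanded log: the original columns plus the parameters parsed from the url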
   val full_logs_schema =StructType(
     Array(
       StructField("event_time", StringType),
       StructField("userUID", StringType),
       StructField("userSID", StringType),
       StructField("actionBegin", StringType),
       StructField("actionEnd", StringType),
       StructField("actionType", StringType),
       StructField("actionName", StringType),
       StructField("actionValue", StringType),
       StructField("actionTest", StringType),
       StructField("ifEquipment", StringType),
       StructField("method", StringType),
       StructField("status", StringType),
       StructField("sip", StringType),
       StructField("user_uip", StringType),
       StructField("action_prepend", StringType),
       StructField("action_client", StringType)
     )
   )
    val full_logDF: DataFrame = spark.createDataFrame(full_logs_rdd,full_logs_schema)
    full_logDF.show(3,false)
    full_logDF.printSchema()

    //Connect to the MySQL database

    val properties = new Properties()
    properties.setProperty("user",JdbcUtils.user)
    properties.setProperty("password",JdbcUtils.pwd)
    properties.setProperty("Driver",JdbcUtils.driver)



    println("写入filerLogs 到myusql数据库")
    filterLogs.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url,JdbcUtils.table_access_logs,properties)
    println("写入 filterLogs 到数据库完成")

    println("写入filer_LogDF 到myusql数据库")
    full_logDF.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url,JdbcUtils.table_full_access_logs,properties)
    println("写入 filter_LogDF  到数据库完成")

    spark.stop()


  }
}
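
The heart of the expansion step is turning the query string inside the url field into a Map. Here is a minimal, self-contained sketch of that logic using a made-up URL, just for illustration (the key names mirror the ones the job extracts):

object UrlParamsDemo {
  def main(args: Array[String]): Unit = {
    //Hypothetical url value, only for illustration
    val url = "http://example.com/action?userUID=u001&userSID=s123&actionName=Signin"

    val parts = url.split("\\?")
    val paramsMap: Map[String, String] =
      if (parts.length == 2)
        parts(1).split("&")        //"userUID=u001", "userSID=s123", ...
          .map(_.split("="))       //Array("userUID", "u001"), ...
          .filter(_.length == 2)   //drop malformed pairs
          .map(kv => (kv(0), kv(1)))
          .toMap
      else Map.empty               //no query string -> empty map

    println(paramsMap.getOrElse("userUID", ""))    //u001
    println(paramsMap.getOrElse("actionName", "")) //Signin
  }
}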

Implementing the Requirements

With the data cleaning done, run the required analysis against the tables in MySQL.

The requirements are as follows:
(image: analysis requirements)
Code implementation:

import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.commons.lang.StringUtils
import nj.zb.kb11.etl.url.JdbcUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}

object Retention {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName("innerdemo").setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext

    val properties = new Properties()
    properties.setProperty("user",JdbcUtils.user)
    properties.setProperty("password",JdbcUtils.pwd)
    properties.setProperty("Driver",JdbcUtils.driver)

    val logs: DataFrame = spark.read.jdbc(JdbcUtils.url,JdbcUtils.table_full_access_logs,properties)
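    //Cache the table, since it is scanned twice below (once for Registered, once for Signin)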
    logs.cache()
//    logs.show(1,false)
//    logs.printSchema()
    import spark.implicits._

    //Pull the records whose actionName is Registered
    val registered: DataFrame = logs.filter($"actionName" === "Registered")
      .withColumnRenamed("event_time", "register_time")
      .select("userUID", "register_time")
    registered.printSchema()
//    registered.show(3,false)

    //Pull the records whose actionName is Signin
    val signin: DataFrame = logs.filter($"actionName" === "Signin")
      .withColumnRenamed("event_time", "signin_time")
      .select("userUID", "signin_time")
//      signin.show(3,false)

    val joined: DataFrame = registered.join(signin,Seq("userUID"),"left")
    joined.printSchema()
    joined.show(3,false)

    val spdf = new SimpleDateFormat("yyyy-MM-dd")
    //Register a UDF that takes a date string and returns the epoch time in milliseconds as a Long
    val gszh: UserDefinedFunction = spark.udf.register("gszh", (event_time: String) => {
      if (StringUtils.isEmpty(event_time))
        0L
      else {
        spdf.parse(event_time).getTime
      }
    })

    val joined2: DataFrame = joined.withColumn("register_date",gszh($"register_time"))
        .withColumn("signin_date",gszh($"signin_time"))
//    joined2.show(3,false)
//    joined2.printSchema()
    import org.apache.spark.sql.functions._
    //One day is 24 * 60 * 60 * 1000 = 86400000 ms; count the users who signed in exactly one day after registering
    val result1: DataFrame = joined2.filter($"register_date" + 86400000 === $"signin_date")
      .groupBy($"register_date", $"userUID").agg(count("userUID"))
      .groupBy($"register_date").agg(count("userUID").as("num1"))
//    result1.printSchema()
//    result1.show()

    //Group by day and count the total number of users who registered on that day
//    registered.printSchema()
    val resultsum: DataFrame = joined2.groupBy("register_date").agg(countDistinct("userUID").as("num_sum"))


    //Next-day retention rate = number of users who sign in the day after registering / total number of users registered that day * 100%
//    val Retention: Column = concat((result1.toString().toInt/resultsum.toString().toInt)*100.toString,"%")

    result1.join(resultsum,"register_date").select(($"num1"/$"num_sum"*100).as("retention_rate")).show()

    spark.stop()
  }
}
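
As a sanity check on the formula, here is a tiny worked example with made-up numbers. If it is pasted into the same main method (so that spark and spark.implicits._ are in scope), 40 users registering on a day and 10 of them signing in the next day should give a retention of 10 / 40 * 100 = 25:

    //Hypothetical numbers, only to illustrate the join and the formula
    val demoNext = Seq((20210301L, 10L)).toDF("register_date", "num1")     //next-day sign-ins
    val demoReg  = Seq((20210301L, 40L)).toDF("register_date", "num_sum")  //registrations that day
    demoNext.join(demoReg, "register_date")
      .select(($"num1" / $"num_sum" * 100).as("retention_rate"))
      .show()   //prints 25.0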