Processing log files with Spark (data cleaning)
Preparing the source data
Prepare a file in .log format whose content looks like the following.
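The sample file itself is not reproduced here. Purely as a hypothetical illustration (this line is invented, not taken from the real dataset), each record is assumed to be one line of eight tab-separated fields, event_time, url, method, status, sip, user_uip, action_prepend and action_client, which is the layout the parsing code below expects:

2018-09-04T20:27:31+08:00	https://www.example.com/action?userUID=10001&userSID=abc123&actionBegin=1536150451540&actionEnd=1536150451668&actionType=view&actionName=Registered&actionValue=1&actionTest=0&ifEquipment=pc	POST	200	192.168.146.1	10.0.0.8	-	Mozilla/5.0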
Data cleaning requirements
The requirements, as implemented by the code below: split each line on tab and keep only records with eight fields; drop duplicates on event_time and url; keep only records whose status is 200 and whose event_time is non-empty; expand the query-string parameters of the url field into their own columns; and save both the filtered log and the fully expanded log to MySQL.
Setting up the environment
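The original environment details are not shown here. As a minimal sketch only (the Spark, Scala and connector versions are assumptions; adjust them to the actual cluster), an sbt build providing the APIs used below might look like:

// build.sbt -- a rough sketch; all version numbers are assumptions
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-core" % "2.4.5",
  "org.apache.spark" %% "spark-sql"  % "2.4.5",
  "mysql"            %  "mysql-connector-java" % "5.1.38", // provides the com.mysql.jdbc.Driver class used in JdbcUtils
  "commons-lang"     %  "commons-lang" % "2.6"             // provides org.apache.commons.lang.StringUtils
)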
After the file is read, the data is split apart step by step according to its format, and the final result is written to a MySQL database. The code is shown below.
First define a small parameter object so the connection settings do not have to be repeated at every call site.
object JdbcUtils {
  val url = "jdbc:mysql://192.168.146.222:3306/etldemo"
  val user = "root"
  val driver = "com.mysql.jdbc.Driver"
  val pwd = "1"
  val table_access_logs = "access_logs"
  val table_full_access_logs = "full_access_logs"
}
Running the cleaning
Write the cleaning code that carries out the requirements listed above.
import java.util.Properties
import nj.zb.kb11.etl.url.JdbcUtils
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
object EtlDemo {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("innerdemo").setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
import spark.implicits._
    // Load the log data, split each line on \t, keep records with at least 8 fields, and wrap them in Row objects
val rowRDD: RDD[Row] = sc.textFile("in/test.log").map(x => x.split("\t"))
.filter(x => x.length >= 8)
.map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
// rowRDD.collect().foreach(println)
    // Build the schema for the raw log columns
val logs_schema = StructType(
Array(
StructField("event_time", StringType),
StructField("url", StringType),
StructField("method", StringType),
StructField("status", StringType),
StructField("sip", StringType),
StructField("user_uip", StringType),
StructField("action_prepend", StringType),
StructField("action_client", StringType)
)
)
val logDF: DataFrame = spark.createDataFrame(rowRDD,logs_schema)
// logDF.printSchema()
// logDF.show(3)
    // Drop duplicate (event_time, url) pairs, keep only status 200 records, and drop rows with an empty event_time
val filterLogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
.filter(x => x(3) == "200")
.filter(x => StringUtils.isNotEmpty(x(0).toString))
    // Expand the query string of the url field into a key/value map
    val full_logs_rdd: RDD[Row] = filterLogs.map(line => {
      val str: String = line.getAs[String]("url")
      val paramsArray: Array[String] = str.split("\\?")
      // Default to an empty map so the getOrElse calls below never hit a null when the url has no query string
      var paramsMap: Map[String, String] = Map()
      if (paramsArray.length == 2) {
        paramsMap = paramsArray(1)
          .split("&")
          .map(x => x.split("="))
          .filter(x => x.length == 2)
          .map(x => (x(0), x(1))).toMap
      }
(
line.getAs[String]("event_time"),
paramsMap.getOrElse[String]("userUID", ""),
paramsMap.getOrElse[String]("userSID", ""),
paramsMap.getOrElse[String]("actionBegin", ""),
paramsMap.getOrElse[String]("actionEnd", ""),
paramsMap.getOrElse[String]("actionType", ""),
paramsMap.getOrElse[String]("actionName", ""),
paramsMap.getOrElse[String]("actionValue", ""),
paramsMap.getOrElse[String]("actionTest", ""),
paramsMap.getOrElse[String]("ifEquipment", ""),
line.getAs[String]("method"),
line.getAs[String]("status"),
line.getAs[String]("sip"),
line.getAs[String]("user_uip"),
line.getAs[String]("action_prepend"),
line.getAs[String]("action_client")
)
}).toDF().rdd
val full_logs_schema =StructType(
Array(
StructField("event_time", StringType),
StructField("userUID", StringType),
StructField("userSID", StringType),
StructField("actionBegin", StringType),
StructField("actionEnd", StringType),
StructField("actionType", StringType),
StructField("actionName", StringType),
StructField("actionValue", StringType),
StructField("actionTest", StringType),
StructField("ifEquipment", StringType),
StructField("method", StringType),
StructField("status", StringType),
StructField("sip", StringType),
StructField("user_uip", StringType),
StructField("action_prepend", StringType),
StructField("action_client", StringType)
)
)
val full_logDF: DataFrame = spark.createDataFrame(full_logs_rdd,full_logs_schema)
full_logDF.show(3,false)
full_logDF.printSchema()
    // JDBC connection properties for MySQL
    val properties = new Properties()
    properties.setProperty("user", JdbcUtils.user)
    properties.setProperty("password", JdbcUtils.pwd)
    properties.setProperty("driver", JdbcUtils.driver)
    println("Writing filterLogs to MySQL")
    filterLogs.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url, JdbcUtils.table_access_logs, properties)
    println("Finished writing filterLogs")
    println("Writing full_logDF to MySQL")
    full_logDF.write.mode(SaveMode.Overwrite).jdbc(JdbcUtils.url, JdbcUtils.table_full_access_logs, properties)
    println("Finished writing full_logDF")
spark.stop()
}
}
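To spot-check the result, the written tables can be read back through the same connection properties. A minimal sketch (to be run before spark.stop(), or in spark-shell, reusing JdbcUtils and the properties defined above):

// Verification sketch: read one of the written tables back from MySQL and inspect it
val check = spark.read.jdbc(JdbcUtils.url, JdbcUtils.table_full_access_logs, properties)
check.show(5, false)
println(s"full_access_logs row count: ${check.count()}")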
Implementing the requirement
Building on the cleaned data, run the required analysis against the tables in MySQL.
The requirement: compute the next-day retention rate, i.e. for each registration date, the percentage of newly registered users who sign in again on the following day.
Code implementation:
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.commons.lang.StringUtils
import nj.zb.kb11.etl.url.JdbcUtils
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, SparkSession}
object Retention {
def main(args: Array[String]): Unit = {
val conf: SparkConf = new SparkConf().setAppName("innerdemo").setMaster("local[*]")
val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
val sc: SparkContext = spark.sparkContext
val properties = new Properties()
properties.setProperty("user",JdbcUtils.user)
properties.setProperty("password",JdbcUtils.pwd)
properties.setProperty("Driver",JdbcUtils.driver)
val logs: DataFrame = spark.read.jdbc(JdbcUtils.url,JdbcUtils.table_full_access_logs,properties)
logs.cache()
// logs.show(1,false)
// logs.printSchema()
import spark.implicits._
    // Pull the records whose actionName is Registered (registration events)
val registered: DataFrame = logs.filter($"actionName" === "Registered")
.withColumnRenamed("event_time", "register_time")
.select("userUID", "register_time")
registered.printSchema()
// registered.show(3,false)
    // Pull the records whose actionName is Signin (login events)
val signin: DataFrame = logs.filter($"actionName" === "Signin")
.withColumnRenamed("event_time", "signin_time")
.select("userUID", "signin_time")
// signin.show(3,false)
val joined: DataFrame = registered.join(signin,Seq("userUID"),"left")
joined.printSchema()
joined.show(3,false)
    val spdf = new SimpleDateFormat("yyyy-MM-dd")
    // Register a UDF that takes a date string and returns the epoch time in milliseconds as a Long
    val gszh: UserDefinedFunction = spark.udf.register("gszh", (event_time: String) => {
      if (StringUtils.isEmpty(event_time))
        0L // use a Long literal so both branches have the same type and the UDF return type is Long
      else
        spdf.parse(event_time).getTime
    })
val joined2: DataFrame = joined.withColumn("register_date",gszh($"register_time"))
.withColumn("signin_date",gszh($"signin_time"))
// joined2.show(3,false)
// joined2.printSchema()
import org.apache.spark.sql.functions._
    // One day is 86,400,000 ms; count the users who signed in exactly one day after registering
val result1: DataFrame = joined2.filter($"register_date" + 86400000 === $"signin_date")
.groupBy($"register_date", $"userUID").agg(count("userUID"))
.groupBy($"register_date").agg(count("userUID").as("num1"))
// result1.printSchema()
// result1.show()
    // Count the total number of users who registered on each day
registered.printSchema()
val resultsum: DataFrame = joined2.groupBy("register_date").agg(countDistinct("userUID").as("num_sum"))
    // Next-day retention rate = users who sign in the day after registering / total users registered that day * 100%
// val Retention: Column = concat((result1.toString().toInt/resultsum.toString().toInt)*100.toString,"%")
result1.join(resultsum,"register_date").select($"num1"/$"num_sum"*100).show()
}
}
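If the retention rate should also be persisted rather than only printed, the same JDBC write used in the cleaning step applies. A minimal sketch to add inside main before it ends (the target table name retention_rate is an assumption):

// Sketch: persist the per-day next-day retention rate to MySQL; "retention_rate" is a hypothetical table name
val retention = result1.join(resultsum, "register_date")
  .select($"register_date", ($"num1" / $"num_sum" * 100).as("retention_pct"))
retention.write.mode(org.apache.spark.sql.SaveMode.Overwrite)
  .jdbc(JdbcUtils.url, "retention_rate", properties)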