数据集下载:https://wwa.lanzous.com/iPrPVijmtle
1. 项目需求
使用Spark完成下列日志分析项目需求:
- 日志数据清洗
- 用户留存分析
- 活跃用户分析
- 活跃用户地域信息分析
- 用户浏览深度分析
用例1:数据清洗
读入日志文件并转化为RDD[Row]类型
- 按照Tab切割数据
- 过滤掉字段数量少于8个的
对数据进行清洗
- 按照第一列和第二列对数据进行去重
- 过滤掉状态码非200
- 过滤掉event_time为空的数据
- 将url按照"&"以及"="切割
保存数据
- 将数据写入mysql表中
package projects
import java.util.Properties
import org.apache.commons.lang3.StringUtils
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
object LogClear {

  /** One raw log record: a line with exactly 8 tab-separated fields. */
  case class Log(
    event_time: String,
    url: String,
    method: String,
    status: String,
    sip: String,
    user_uip: String,
    action_prepend: String,
    action_client: String
  )

  /**
    * Parse the query string of `url` into a key -> value map.
    *
    * Pairs without exactly one '=' are dropped; a URL that does not split
    * into exactly two parts on '?' yields an empty map.
    */
  private def parseUrlParams(url: String): Map[String, String] = {
    // '?' is a regex metacharacter for String.split, so it must be escaped.
    val parts: Array[String] = url.split("\\?")
    if (parts.length == 2) {
      parts(1)
        .split("&")
        .map(_.split("="))
        .filter(_.length == 2)
        .map(kv => (kv(0), kv(1)))
        .toMap
    } else {
      Map.empty[String, String]
    }
  }

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("spark")
      .master("local[3]")
      .getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._

    val linesRDD: RDD[String] = sc.textFile("in/test.txt")

    // Split each line on Tab and keep only lines with exactly 8 fields.
    val ds: DataFrame = linesRDD.map(_.split("\t"))
      .filter(_.length == 8)
      .map(x => Log(
        x(0).trim,
        x(1).trim,
        x(2).trim,
        x(3).trim,
        x(4).trim,
        x(5).trim,
        x(6).trim,
        x(7).trim
      )).toDF()

    // Clean the data:
    //  - deduplicate on (event_time, url)
    //  - keep only rows whose status code is "200"
    //  - drop rows with a null or empty event_time
    //  - split the URL query string on "&" and "=" into named columns
    val detailDF: DataFrame = ds.dropDuplicates("event_time", "url")
      // getAs by column name is clearer and safer than positional x(3);
      // a null status simply fails the comparison.
      .filter(row => row.getAs[String]("status") == "200")
      // isNotEmpty(null) is false, so null event_time rows are filtered out
      // instead of throwing an NPE (the original called .toString on the raw cell).
      .filter(row => StringUtils.isNotEmpty(row.getAs[String]("event_time")))
      .map(row => {
        val map: Map[String, String] = parseUrlParams(row.getAs[String]("url"))
        (
          row.getAs[String]("event_time"),
          map.getOrElse("actionBegin", ""),
          map.getOrElse("actionClient", ""),
          map.getOrElse("actionEnd", ""),
          map.getOrElse("actionName", ""),
          map.getOrElse("actionTest", ""),
          map.getOrElse("actionType", ""),
          map.getOrElse("actionValue", ""),
          map.getOrElse("clientType", ""),
          map.getOrElse("examType", ""),
          map.getOrElse("ifEquipment", ""),
          map.getOrElse("isFromContinue", ""),
          map.getOrElse("testType", ""),
          map.getOrElse("userSID", ""),
          map.getOrElse("userUID", ""),
          map.getOrElse("userUIP", ""),
          row.getAs[String]("method"),
          row.getAs[String]("status"),
          row.getAs[String]("sip"),
          row.getAs[String]("user_uip"),
          row.getAs[String]("action_prepend"),
          row.getAs[String]("action_client")
        )
      }).toDF(
        "event_time",
        "actionBegin",
        "actionClient",
        "actionEnd",
        "actionName",
        "actionTest",
        "actionType",
        "actionValue",
        "clientType",
        "examType",
        "ifEquipment",
        "isFromContinue",
        "testType",
        "userSID",
        "userUID",
        "userUIP",
        "method",
        "status",
        "sip",
        "user_uip",
        "action_prepend",
        "action_client"
      )

    // Persist both the exploded detail table and the cleaned raw table.
    val url = "jdbc:mysql://hadoop100:3306/spark"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "ok")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")
    println("开始写入mysql")
    detailDF.write.mode("overwrite")
      .jdbc(url, "UrlDetail", prop)
    ds.write.mode("overwrite")
      .jdbc(url, "LogDetail", prop)
    println("写入mysql结束")
  }
}
用例2:用户留存分析
计算用户的次日留存率
求当天新增用户总数n
求当天新增的用户ID与次日登录的用户ID的交集,得出新增用户次日登录总数m (次日留存数)
m/n*100%
package projects
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.spark.SparkContext
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.{DataFrame, SparkSession}
object LogAnalysis {

  // Milliseconds in one day. Event times are truncated to day boundaries by
  // the changeTime UDF below, so "registered yesterday, signed in today"
  // is an exact difference of one day.
  private val OneDayMillis: Long = 86400000L

  def main(args: Array[String]): Unit = {
    val spark: SparkSession = SparkSession.builder()
      .appName("spark")
      .master("local[3]")
      .getOrCreate()
    import spark.implicits._

    val url = "jdbc:mysql://hadoop100:3306/spark"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "ok")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")

    // Truncate a "yyyy-MM-dd ..." timestamp string to midnight-of-day epoch
    // millis. Assumes event_time was cleaned to be non-empty upstream.
    val changeTime: UserDefinedFunction = spark.udf.register("changeTime", (x: String) => {
      new SimpleDateFormat("yyyy-MM-dd").parse(x.substring(0, 10)).getTime
    })

    val urlDetail: DataFrame = spark.read.jdbc(url, "UrlDetail", prop)

    // All registrations: (uid, registration day).
    val registDF: DataFrame = urlDetail.filter(urlDetail("actionName") === "Registered")
      .select($"userUID".as("regUID"), changeTime($"event_time").as("register_time"), $"actionName").distinct()

    // All sign-ins: (uid, sign-in day).
    val signinDF: DataFrame = urlDetail.filter(urlDetail("actionName") === "Signin")
      .select($"userUID".as("sigUID"), changeTime($"event_time").as("signin_time"), $"actionName").distinct()

    // Users who signed in exactly one day after registering (next-day retention).
    val joinDF: DataFrame = registDF.join(signinDF, registDF("regUID") === signinDF("sigUID"))
    val retained: Long = joinDF
      .filter(joinDF("signin_time") - joinDF("register_time") === OneDayMillis)
      .count()
    val registered: Long = registDF.count()

    // Guard against division by zero: with no registrations, report 0%
    // retention instead of printing "NaN%".
    val rate: Double = if (registered == 0) 0.0 else retained.toDouble * 100 / registered
    println("日留存率为:" + "%.4f".format(rate) + "%")
  }
}
用例3:活跃用户分析
统计分析需求
- 读取数据库,统计每天的活跃用户数
- 统计规则:有看课和买课行为的用户才属于活跃用户
- 对UID进行去重
// Daily active users: only "StartLearn" / "BuyCourse" actions count as
// activity, and UIDs are de-duplicated within each day before counting.
val dailyActiveDF: DataFrame = urlDetail
  .filter(urlDetail("actionName").isin("StartLearn", "BuyCourse"))
  .select(changeTime($"event_time").as("event_time"), $"userUID")
  .distinct()
  .groupBy("event_time")
  .count()
dailyActiveDF.show()