Simulated log data format
2018-09-04T20:27:31+08:00 http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451690&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F63.0.3239.132+Safari%2F537.36&actionEnd=1536150451783&actionName=viewQuestionAnalysis&actionTest=0&actionType=3&actionValue=261533&clientType=001_bdqn&examType=001&ifEquipment=web&questionId=35406&skillIdCount=0&userSID=E02CD8FDB92BD9A3D777B7425D4FE8A0.kgc-tiku-node1.kgc-tiku-node1&userUID=261533&userUIP=115.200.118.135 GET 200 192.168.168.79 - - Apache-HttpClient/4.1.2 (java 1.5)
event_time: 2018-09-04T20:27:31+08:00
url: http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451690&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F63.0.3239.132+Safari%2F537.36&actionEnd=1536150451783&actionName=viewQuestionAnalysis&actionTest=0&actionType=3&actionValue=261533&clientType=001_bdqn&examType=001&ifEquipment=web&questionId=35406&skillIdCount=0&userSID=E02CD8FDB92BD9A3D777B7425D4FE8A0.kgc-tiku-node1.kgc-tiku-node1&userUID=261533&userUIP=115.200.118.135
method: GET
status: 200
sip: 192.168.168.79
user_uip: -
action_prepend: -
action_client: Apache-HttpClient/4.1.2 (java 1.5)
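The url field carries the interesting event attributes as query-string parameters. As a minimal standalone sketch (plain Scala, no Spark; the shortened URL below is illustrative only), the parameters can be recovered by splitting on "?", then "&", then "=":

object UrlParamDemo {
  def main(args: Array[String]): Unit = {
    // A shortened version of the sample url above, for illustration only
    val url = "http://datacenter.bdqn.cn/logs/user?actionName=viewQuestionAnalysis&userUID=261533&userUIP=115.200.118.135"
    val parts = url.split("\\?")
    // Keep only well-formed key=value pairs
    val params: Map[String, String] =
      if (parts.length == 2)
        parts(1).split("&").map(_.split("=")).filter(_.length == 2).map(a => (a(0), a(1))).toMap
      else Map("params" -> "null")
    println(params("actionName")) // viewQuestionAnalysis
    println(params("userUID"))    // 261533
  }
}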
Cleaning tasks
- Read the log file and convert it to an RDD[Row]
- Split each line on the Tab character
- Filter out records with fewer than 8 fields
- Clean the data:
  - Deduplicate on the first and second columns (event_time and url; see the sketch after this list)
  - Filter out records whose status code is not 200
  - Filter out records with an empty event_time
  - Split the url on "&" and "="
- Save the data:
  - Write the data into a MySQL table
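Before the full pipeline, here is a minimal sketch of the two-column dedup step referenced above; the sample rows are made up for illustration. dropDuplicates keeps one row per distinct (event_time, url) pair:

import org.apache.spark.sql.SparkSession

object DedupSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("dedupSketch").master("local[*]").getOrCreate()
    import spark.implicits._
    val df = Seq(
      ("2018-09-04T20:27:31+08:00", "http://a", "GET"),
      ("2018-09-04T20:27:31+08:00", "http://a", "POST"), // same (event_time, url) -> dropped
      ("2018-09-04T20:27:32+08:00", "http://a", "GET")
    ).toDF("event_time", "url", "method")
    df.dropDuplicates("event_time", "url").show(truncate = false)
    spark.stop()
  }
}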
Task implementation (cleaning)
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import scala.collection.mutable
object DataClear {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("clearDemo").master("local[*]").getOrCreate()
    import spark.implicits._
    val sc = spark.sparkContext
    // Read the raw log file; each record holds 8 tab-separated fields
    val rdd: RDD[String] = sc.textFile("in/test.log")
    val rdd2: RDD[Array[String]] = rdd.map(x => x.split("\t"))
    // Drop records with fewer than 8 fields, then wrap the first 8 fields in Row objects
    val rdd3: RDD[Row] = rdd2
      .filter(x => x.length >= 8)
      .map(x => Row(x(0).trim, x(1).trim, x(2).trim, x(3).trim, x(4).trim, x(5).trim, x(6).trim, x(7).trim))
    // Schema for the 8 raw columns
    val schema = StructType(Array(
      StructField("event_time", StringType),
      StructField("url", StringType),
      StructField("method", StringType),
      StructField("status", StringType),
      StructField("sip", StringType),
      StructField("user_uip", StringType),
      StructField("action_prepend", StringType),
      StructField("action_client", StringType)
    ))
    val df: DataFrame = spark.createDataFrame(rdd3, schema)
println("-----------------开始清洗日志数据----------------------")
val df1_1: Dataset[Row] = df.dropDuplicates("event_time","url")
val df1_2: Dataset[Row] = df1_1.filter(x=>x(3)=="200")
val df1_3: Dataset[Row] = df1_2.filter(x=>StringUtils.isNotEmpty(x(0).toString))
    // Split the url into base and query string; parse well-formed key=value pairs into a Map
    val frame: DataFrame = df1_3.map(row => {
      val urlArr: Array[String] = row.getAs[String]("url").split("\\?")
      var map: Map[String, String] = Map("params" -> "null")
      if (urlArr.length == 2) {
        map = urlArr(1).split("&").map(x => x.split("=")).filter(x => x.length == 2).map(x => (x(0), x(1))).toMap
      }
      // Flatten the parsed parameters together with the original columns into one wide tuple
      (
        row.getAs[String]("event_time"),
        map.getOrElse("actionBegin", ""),
        map.getOrElse("actionClient", ""),
        map.getOrElse("actionEnd", ""),
        map.getOrElse("actionName", ""),
        map.getOrElse("actionTest", ""),
        map.getOrElse("actionType", ""),
        map.getOrElse("actionValue", ""),
        map.getOrElse("clientType", ""),
        map.getOrElse("ifEquipment", ""),
        map.getOrElse("testType", ""),
        map.getOrElse("userSID", ""),
        map.getOrElse("userUID", ""),
        map.getOrElse("userUIP", ""),
        row.getAs[String]("method"),
        row.getAs[String]("status"),
        row.getAs[String]("sip"),
        row.getAs[String]("user_uip"),
        row.getAs[String]("action_prepend"),
        row.getAs[String]("action_client")
      )
    }).toDF()
    val rddFrame: RDD[Row] = frame.rdd
    // Schema for the flattened record; every field is a non-nullable string
    val schemaDF = StructType(Array(
      StructField("event_time", StringType, false),
      StructField("actionBegin", StringType, false),
      StructField("actionClient", StringType, false),
      StructField("actionEnd", StringType, false),
      StructField("actionName", StringType, false),
      StructField("actionTest", StringType, false),
      StructField("actionType", StringType, false),
      StructField("actionValue", StringType, false),
      StructField("clientType", StringType, false),
      StructField("ifEquipment", StringType, false),
      StructField("testType", StringType, false),
      StructField("userSID", StringType, false),
      StructField("userUID", StringType, false),
      StructField("userUIP", StringType, false),
      StructField("method", StringType, false),
      StructField("status", StringType, false),
      StructField("sip", StringType, false),
      StructField("user_uip", StringType, false),
      StructField("action_prepend", StringType, false),
      StructField("action_client", StringType, false)
    ))
    val orgDF: DataFrame = spark.createDataFrame(rddFrame, schemaDF)
    // JDBC connection settings for the target MySQL instance
    val url = "jdbc:mysql://192.168.95.99:3306/sparkTest"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "root123")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")
    println("Writing to MySQL ...")
    orgDF.write.mode("overwrite").jdbc(url, "orgDF", prop)
    println("Write complete")
    spark.stop()
  }
}
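As a quick sanity check (a hedged sketch, assuming the same MySQL instance and credentials as in DataClear), the freshly written orgDF table can be read back and counted:

import java.util.Properties
import org.apache.spark.sql.SparkSession

object VerifyWrite {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("verifyWrite").master("local[*]").getOrCreate()
    val url = "jdbc:mysql://192.168.95.99:3306/sparkTest"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "root123")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")
    // Read the table the cleaning job just wrote and confirm the row count
    val back = spark.read.jdbc(url, "orgDF", prop)
    println(s"rows written: ${back.count()}")
    back.show(5, truncate = false)
    spark.stop()
  }
}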
Application tasks
- Compute the users' next-day retention rate (worked example below):
  - Find the total number of new users on a given day, n
  - Intersect that day's new user IDs with the user IDs that sign in on the following day, giving the number of new users who return the next day, m (the next-day retention count)
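For example, if n = 200 users register on a given day and m = 50 of them sign in again the next day, the next-day retention rate is m / n = 50 / 200 = 25%.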
Task implementation (application)
import java.text.SimpleDateFormat
import java.util.Properties
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql._
object UserAnalysis {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("userAnalysis").master("local[*]").getOrCreate()
    import spark.implicits._
    // JDBC connection settings; must match the ones used by the cleaning job
    val url = "jdbc:mysql://192.168.95.99:3306/sparkTest"
    val prop = new Properties()
    prop.setProperty("user", "root")
    prop.setProperty("password", "root123")
    prop.setProperty("driver", "com.mysql.jdbc.Driver")
    // Read the cleaned data back from MySQL
    val detailDF: DataFrame = spark.read.jdbc(url, "orgDF", prop)
    // UDF: truncate the timestamp string to its date part (yyyy-MM-dd) and convert it to epoch milliseconds
    val changeTime: UserDefinedFunction = spark.udf.register("changeTime", (v: String) => {
      new SimpleDateFormat("yyyy-MM-dd").parse(v.substring(0, 10)).getTime
    })
    // New registrations: (user ID, registration day in ms)
    val regUser: DataFrame = detailDF
      .filter(detailDF("actionName") === "Registered")
      .select($"userUID" as "regUID", changeTime($"event_time") as "rt")
    // Distinct sign-ins: (user ID, sign-in day in ms)
    val sigUser: DataFrame = detailDF
      .filter(detailDF("actionName") === "Signin")
      .select($"userUID" as "sigUID", changeTime($"event_time") as "st")
      .distinct()
    // n: number of new users per registration day
    val regUserCountDay: DataFrame = regUser
      .groupBy("rt")
      .count()
      .withColumnRenamed("count", "regQt")
    // m: new users who sign in exactly one day (86400000 ms) after registering, counted per day
    val regAndSig: DataFrame = regUser.join(sigUser, regUser("regUID") === sigUser("sigUID"))
      .select("regUID", "rt", "st")
      .filter($"rt" === $"st" - 86400000)
      .select("rt")
      .groupBy("rt")
      .count()
      .withColumnRenamed("count", "sigQt")
    // Next-day retention per day: (day, m, n, m / n)
    val avgReg: Dataset[(Long, Long, Long, Double)] = regAndSig.join(regUserCountDay, "rt")
      .map(x => (
        x.getAs[Long]("rt"),
        x.getAs[Long]("sigQt"),
        x.getAs[Long]("regQt"),
        x.getAs[Long]("sigQt").toDouble / x.getAs[Long]("regQt")
      ))
    avgReg.show()
    spark.stop()
  }
}
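The job above only displays the result. If the per-day retention should also be persisted, a hedged sketch meant to be appended at the end of main above (the target table name "retention" and the column names are assumptions for illustration):

    // Name the tuple columns, then write back over JDBC (hypothetical table "retention")
    val retentionDF = avgReg.toDF("rt", "sigQt", "regQt", "retention")
    retentionDF.write.mode("overwrite").jdbc(url, "retention", prop)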