Data Sample
2018-09-04T20:27:31+08:00 http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+MetaSr+1.0&actionEnd=1536150451668&actionName=startEval&actionTest=0&actionType=3&actionValue=272090&clientType=001_kgc&examType=001&ifEquipment=web&isFromContinue=false&skillIdCount=0&skillLevel=0&testType=jineng&userSID=B842B843AE317425D53D0C567A903EF7.exam-tomcat-node3.exam-tomcat-node3&userUID=272090&userUIP=1.180.18.157 GET 200 192.168.168.64 - - Apache-HttpClient/4.1.2 (java 1.5)
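Each record is a single line of eight Tab-separated fields (the tabs are easy to miss in the rendered sample above). A minimal sketch of how one line decomposes, with the long url shortened here for readability; the field names follow the schema used in the code below:

val line = "2018-09-04T20:27:31+08:00\thttp://datacenter.bdqn.cn/logs/user?userUID=272090\tGET\t200\t192.168.168.64\t-\t-\tApache-HttpClient/4.1.2 (java 1.5)"
val fields = line.split("\t")
// fields(0) = event_time, fields(1) = url, fields(2) = method, fields(3) = status,
// fields(4) = sip, fields(5) = user_uip, fields(6) = action_prepend, fields(7) = action_client
println(fields.length) // 8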
Data Cleaning
1. Read the log file and convert it to an RDD[Row]
Split each line on Tab
Filter out records with fewer than 8 fields
2. Clean the data
Deduplicate on the first and second columns
Filter out records whose status code is not 200
Filter out records with an empty event_time
Split the url on "&" and "=" (a standalone sketch of this parsing follows this list)
3. Save the data
Write the result to a MySQL table (the program below stops at show(); a sketch of the write is appended at the end of the code)
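Before the full program, here is a minimal standalone sketch of the url-splitting step from 2, assuming the query string consists of key=value pairs joined by "&" (sampleUrl is abbreviated from the sample record above):

val sampleUrl = "http://datacenter.bdqn.cn/logs/user?actionName=startEval&actionType=3&userUID=272090"
val parts = sampleUrl.split("\\?") // "?" is a regex metacharacter, so it must be escaped
val params: Map[String, String] =
  if (parts.length == 2)
    parts(1).split("&")      // "actionName=startEval", "actionType=3", ...
      .map(_.split("="))     // Array("actionName", "startEval"), ...
      .filter(_.length == 2) // drop malformed pairs
      .map(kv => (kv(0), kv(1)))
      .toMap
  else Map.empty
println(params.getOrElse("userUID", "")) // prints 272090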
package cn.kgc.test01
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
/**
* @author WGY
*/
object DataClean extends App {
//1. Create the SparkSession
private val spark: SparkSession = SparkSession.builder()
  .master("local[*]")
  .appName("dataclean")
  .config("spark.executor.memory", "4G")
  .getOrCreate()
private val sc: SparkContext = spark.sparkContext
//Import the implicit conversions (needed for toDF and the tuple encoders below)
import spark.implicits._
//The old way of reading the file (absolute path):
// val logPath="file:///D:\\IntelliJ IDEA 2018.2.2\\Project\\testscala\\spark_0814\\src\\data\\test.log"
val logPath = "spark_0814/src/data/test.log" // relative path
//Read the log file and split each line on Tab
private val logfileRDD: RDD[Array[String]] = sc.textFile(logPath).map(_.split("\t"))
// println(logfileRDD.take(1)) // quick check that the file can be read
//Clean the data: filter out records with fewer than 8 fields (keep only rows with exactly 8)
private val filterRDD: RDD[Array[String]] = logfileRDD.filter(_.length == 8)
//Converting the RDD to a DataFrame takes more than a plain toDF(): either a case class or an explicit schema is needed. Here we use a schema.
private val rowRDD: RDD[Row] = filterRDD.map(x => Row(x(0).trim, x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
/**
* event_time
* url
* method
* status
* sip
* user_uip
* action_prepend
* action_client
*/
//Define the schema from the field names above
private val schema_access = StructType(Array(
StructField("event_time", StringType, true),
StructField("url", StringType, true),
StructField("method", StringType, true),
StructField("status", StringType, true),
StructField("sip", StringType, true),
StructField("user_uip", StringType, true),
StructField("action_prepend", StringType, true),
StructField("action_client", StringType, true)
))
//Convert to a DataFrame
private val logsDF: DataFrame = spark.createDataFrame(rowRDD, schema_access)
logsDF.printSchema() // check that the conversion succeeded
logsDF.show()
/**
 * Deduplicate on the first and second columns
 * Filter out records whose status code is not 200
 * Filter out records with an empty event_time
 * Split the url on "&" and "="
 */
//Deduplicate: dropDuplicates("col1", "col2", ...)
private val filterLogs: Dataset[Row] = logsDF.dropDuplicates("event_time", "url")
.filter(x => x(3) == "200") // keep only records with status code 200
.filter(x => StringUtils.isNotEmpty(x(0).toString)) // drop records with an empty event_time
filterLogs.show()
//Split the url. A sample record, broken out by query-string parameter, for reference:
//2018-09-04T20:27:31+08:00
//http://datacenter.bdqn.cn/logs/learn?ifEquipment=005
// &userSID=sodwly%40163.com
// &actionValue=23710
// &actionBegin=1536150451762
// &actionName=StartLearn
// &userUIP=192.168.168.14
// &actionType=3
// &actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F63.0.3239.26+Safari%2F537.36+Core%2F1.63.6726.400+QQBrowser%2F10.2.2265.400
// &actionEnd=1536150451766
// &userUID=719786
// &actionTest=0 GET 200 192.168.168.124 - - Apache-HttpClient/4.1.2 (java 1.5)
//Parse each record's query string into a Map and flatten everything into a 16-field tuple
private val full_access_logs_rdd: RDD[Row] = filterLogs.map(x => {
val paramsArr = x.getAs[String]("url").split("\\?")
var paramsMap = Map("params" -> "null") // default used when the url has no query string
//Only parse when the split yields exactly two parts: the path and the query string
if (paramsArr.length == 2) {
paramsMap = paramsArr(1).split("&") // e.g. "actionType=3"
.map(_.split("="))                  // e.g. Array("actionType", "3")
.filter(_.length == 2)              // drop malformed key/value pairs
.map(x => (x(0), x(1)))             // turn each pair into a tuple
.toMap
}
//To load the result into a MySQL table we need a DataFrame, which in turn needs an RDD[Row]
//Return a tuple here; note that a Scala tuple cannot exceed 22 elements
(
x.getAs[String]("event_time"),
paramsMap.getOrElse[String]("userUID", ""),
paramsMap.getOrElse[String]("userSID", ""),
paramsMap.getOrElse[String]("actionBegin", ""),
paramsMap.getOrElse[String]("actionEnd", ""),
paramsMap.getOrElse[String]("actionType", ""),
paramsMap.getOrElse[String]("actionName", ""),
paramsMap.getOrElse[String]("actionValue", ""),
paramsMap.getOrElse[String]("actionTest", ""),
paramsMap.getOrElse[String]("ifEquipment", ""),
x.getAs[String]("method"),
x.getAs[String]("status"),
x.getAs[String]("sip"),
x.getAs[String]("user_uip"),
x.getAs[String]("action_prepend"),
x.getAs[String]("action_client")
)
}).toDF().rdd
//Define the schema for the flattened records
private val full_access_logs_schema = StructType(Array(
StructField("event_time", StringType, true),
StructField("user_uid", StringType, true),
StructField("user_sid", StringType, true),
StructField("action_begin", StringType, true),
StructField("action_end", StringType, true),
StructField("action_type", StringType, true),
StructField("action_name", StringType, true),
StructField("action_value", StringType, true),
StructField("action_test", StringType, true),
StructField("if_equipment", StringType, true),
StructField("method", StringType, true),
StructField("status", StringType, true),
StructField("sip", StringType, true),
StructField("user_uip", StringType, true),
StructField("action_prepend", StringType, true),
StructField("action_client", StringType, true)
))
//With both the RDD[Row] and the schema in hand, create the final DataFrame
private val full_access_logs_DF: DataFrame = spark.createDataFrame(full_access_logs_rdd, full_access_logs_schema)
full_access_logs_DF.show()
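//Step 3 (save the data) is described in the outline but missing from the
//original code, which stops at show(). A minimal sketch of the MySQL write,
//assuming a local MySQL instance with a database named "logs", credentials
//"root"/"root", and a target table named "full_access_logs" (all hypothetical):
val jdbcUrl = "jdbc:mysql://localhost:3306/logs"
val jdbcProps = new java.util.Properties()
jdbcProps.setProperty("user", "root")
jdbcProps.setProperty("password", "root")
jdbcProps.setProperty("driver", "com.mysql.jdbc.Driver")
full_access_logs_DF.write
.mode("overwrite") // replace the table if it already exists
.jdbc(jdbcUrl, "full_access_logs", jdbcProps)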
}