Sample data (a single line):
2018-09-04T20:27:31+08:00 http://datacenter.bdqn.cn/logs/user?actionBegin=1536150451540&actionClient=Mozilla%2F5.0+%28Windows+NT+10.0%3B+WOW64%29+AppleWebKit%2F537.36+%28KHTML%2C+like+Gecko%29+Chrome%2F58.0.3029.110+Safari%2F537.36+SE+2.X+MetaSr+1.0&actionEnd=1536150451668&actionName=startEval&actionTest=0&actionType=3&actionValue=272090&clientType=001_kgc&examType=001&ifEquipment=web&isFromContinue=false&skillIdCount=0&skillLevel=0&testType=jineng&userSID=B842B843AE317425D53D0C567A903EF7.exam-tomcat-node3.exam-tomcat-node3&userUID=272090&userUIP=1.180.18.157 GET 200 192.168.168.64 - - Apache-HttpClient/4.1.2 (java 1.5)
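Each line carries eight tab-separated fields (event_time, url, method, status, sip, user_uip, action_prepend, action_client), and the url's query string holds the per-event parameters (userUID, actionName, and so on). A minimal sketch of how one line comes apart, assuming the delimiters really are tabs (the query string is abbreviated here):

val line = "2018-09-04T20:27:31+08:00\thttp://datacenter.bdqn.cn/logs/user?actionName=startEval&userUID=272090\tGET\t200\t192.168.168.64\t-\t-\tApache-HttpClient/4.1.2 (java 1.5)"
val fields = line.split("\t")                      // 8 fields: event_time .. action_client
val params = fields(1).split("\\?")(1).split("&")  // key=value pairs from the query string
  .map(_.split("=")).filter(_.length == 2)
  .map(kv => (kv(0), kv(1))).toMap
params("userUID")                                  // "272090"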
The full implementation follows:
package kb11.ed
import java.util.Properties
import org.apache.commons.lang.StringUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql._
import org.apache.spark.sql.functions._
object EtlDemo {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("etldemo")
    val spark: SparkSession = SparkSession.builder().config(conf).getOrCreate()
    val sc: SparkContext = spark.sparkContext
    import spark.implicits._
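    // spark.implicits._ supplies the Encoder needed by the tuple-producing map below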
    // load the log file, split each line on \t, keep only lines with exactly 8 fields,
    // and wrap each field array in a Row
    val rowRdd: RDD[Row] = sc.textFile("inpath/test.log").map(x => x.split("\t"))
      .filter(_.length == 8).map(x => Row(x(0), x(1), x(2), x(3), x(4), x(5), x(6), x(7)))
    val log_schema: StructType = StructType(Array(
      StructField("event_time", StringType),
      StructField("url", StringType),
      StructField("method", StringType),
      StructField("status", StringType),
      StructField("sip", StringType),
      StructField("user_uip", StringType),
      StructField("action_prepend", StringType),
      StructField("action_client", StringType)
    ))
    val logDF: DataFrame = spark.createDataFrame(rowRdd, log_schema)
    // drop rows duplicated on (event_time, url), keep only status 200,
    // and drop rows whose event_time is empty
    val filteredLogs: Dataset[Row] = logDF.dropDuplicates("event_time", "url")
      .filter(row => row.getAs[String]("status") == "200")
      .filter(row => StringUtils.isNotEmpty(row.getAs[String]("event_time")))
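    // flatten each surviving row into a 16-field tuple: event_time, nine action
    // parameters pulled out of the url query string, and the remaining raw columns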
    val full_logs_RDD: RDD[Row] = filteredLogs.map(line => {
      val str = line.getAs[String]("url")
      val paramsArray = str.split("\\?")
      // default to an empty map so the getOrElse calls below never hit a null
      // when the url carries no query string
      var paramsMap: Map[String, String] = Map.empty
      if (paramsArray.length == 2) {
        val strings = paramsArray(1).split("&")
        paramsMap = strings.map(x => x.split("=")).filter(x => x.length == 2)
          .map(x => (x(0), x(1))).toMap
      }
      ( // best kept to 22 fields or fewer (Scala's tuple limit); if you really need more, nest tuples inside tuples
        line.getAs[String]("event_time"),
        paramsMap.getOrElse[String]("userUID", ""),
        paramsMap.getOrElse[String]("userSID", ""),
        paramsMap.getOrElse[String]("actionBegin", ""),
        paramsMap.getOrElse[String]("actionEnd", ""),
        paramsMap.getOrElse[String]("actionType", ""),
        paramsMap.getOrElse[String]("actionName", ""),
        paramsMap.getOrElse[String]("actionValue", ""),
        paramsMap.getOrElse[String]("actionTest", ""),
        paramsMap.getOrElse[String]("ifEquipment", ""),
        line.getAs[String]("method"),
        line.getAs[String]("status"),
        line.getAs[String]("sip"),
        line.getAs[String]("user_uip"),
        line.getAs[String]("action_prepend"),
        line.getAs[String]("action_client")
      )
    }).toDF().rdd
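    // schema naming the 16 tuple fields above (all columns stay strings)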
    val full_logs_schema = StructType(
      Array(
        StructField("event_time", StringType),
        StructField("userUID", StringType),
        StructField("userSID", StringType),
        StructField("actionBegin", StringType),
        StructField("actionEnd", StringType),
        StructField("actionType", StringType),
        StructField("actionName", StringType),
        StructField("actionValue", StringType),
        StructField("actionTest", StringType),
        StructField("ifEquipment", StringType),
        StructField("method", StringType),
        StructField("status", StringType),
        StructField("sip", StringType),
        StructField("user_uip", StringType),
        StructField("action_prepend", StringType),
        StructField("action_client", StringType)
      )
    )
    val full_logDF = spark.createDataFrame(full_logs_RDD, full_logs_schema)
    full_logDF.printSchema()
    full_logDF.show(10)
    // connect to MySQL and persist both DataFrames
    val url = "jdbc:mysql://192.168.119.125:3306/etl"
    val user = "root"
    val password = "ok"
    val driver = "com.mysql.jdbc.Driver"
    val properties = new Properties()
    properties.setProperty("url", url)   // redundant (url is passed to jdbc() explicitly below) but harmless
    properties.setProperty("user", user)
    properties.setProperty("password", password)
    properties.setProperty("driver", driver)
    // write to MySQL
    println("writing filteredLogs to MySQL")
    filteredLogs.write.mode(SaveMode.Overwrite).jdbc(url, "access_logs", properties)
    println("write complete")
    println("writing full_logDF to the database")
    full_logDF.write.mode(SaveMode.Append).jdbc(jdbcUtils.url, jdbcUtils.table_full_access_log, properties)
    println("finished writing full_logDF to the database")
    spark.stop()
  }
}
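The final write references a jdbcUtils helper object that this listing does not define. A minimal sketch of what it presumably contains, reusing the connection settings above; the table name full_access_logs is an assumption:

object jdbcUtils {
  val url = "jdbc:mysql://192.168.119.125:3306/etl"
  val user = "root"
  val password = "ok"
  val table_full_access_log = "full_access_logs"   // assumed; the original only names the field
}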