1. SparkCore log analysis main program (LogAnalyzer)
package com.ibeifeng.bigdata.spark.app.core

import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by XuanYu on 2016/7/11.
 */
object LogAnalyzer {

  def main(args: Array[String]) {
    // step 1: create the SparkConf object
    val conf = new SparkConf()
      .setAppName("LogAnalyzer Application")
      .setMaster("local[2]")
    // step 2: create the SparkContext object
    val sc = new SparkContext(conf)

    /** ====================================================================== */
    val logFile = "file:///D:/access_log" // "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/apache.access.log"

    // 1: input
    val accessLogs = sc.textFile(logFile)
      // keep only lines that match the access log pattern
      .filter(ApacheAccessLog.isValidateLogLine)
      // parse each line into an ApacheAccessLog
      .map(line => ApacheAccessLog.parseLogLine(line))

    // cache: if an RDD is reused several times, consider caching it
    accessLogs.cache()

    println("Access Logs Count : " + accessLogs.count())
    // =====================================================================
    /**
     * The average, min, and max content size of responses returned from the server.
     */
    val contentSizes = accessLogs.map(log => log.contentSize) // RDD[Long]
    // cache contentSizes: it is scanned several times below (reduce, count, min, max)
    contentSizes.cache()

    // compute
    val avgContentSize = contentSizes.reduce(_ + _) / contentSizes.count()
    val minContentSize = contentSizes.min()
    val maxContentSize = contentSizes.max()

    // unpersist
    contentSizes.unpersist()

    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      avgContentSize, minContentSize, maxContentSize
    ))
    // =====================================================================
    /**
     * A count of the response codes returned.
     */
    val responseCodeToCount = accessLogs
      // pair each log with its response code
      .map(log => (log.responseCode, 1))
      // sum the counts per response code
      .reduceByKey(_ + _)
      // collect at most five results to the driver
      .take(5)

    println(s"""Response Code Count: ${responseCodeToCount.mkString("[", ",", "]")}""")
    // =====================================================================
    /**
     * All IP addresses that have accessed this server more than N times.
     */
    val ipAddresses = accessLogs
      // pair each log with its IP address
      .map(log => (log.ipAddress, 1))
      // count requests per IP address
      .reduceByKey(_ + _)
      // keep only IP addresses with more than 10 requests
      .filter(tuple => tuple._2 > 10)
      // keep just the IP address
      .map(tuple => tuple._1)
      // collect at most three results to the driver
      .take(3)

    println(s"""IPAddress : ${ipAddresses.mkString("[", ",", "]")}""")
    // =====================================================================
    /**
     * The top endpoints requested by count.
     */
    val topEndpoints = accessLogs
      // pair each log with its endpoint
      .map(log => (log.endPoint, 1))
      // count requests per endpoint
      .reduceByKey(_ + _)
      // top: def top(num: Int)(implicit ord: Ordering[T])
      .top(3)(OrderingUtils.SecondValueOrdering)
    /**
     * Equivalent approach without a custom Ordering:
     *   .map(tuple => (tuple._2, tuple._1))
     *   .sortByKey(false)
     *   .take(3)
     *   .map(tuple => (tuple._2, tuple._1))
     */

    println(s"""Top Endpoints : ${topEndpoints.mkString("[", ",", "]")}""")

    // unpersist
    accessLogs.unpersist()
    /** ====================================================================== */
    // step 3: stop the SparkContext
    sc.stop()
  }
}
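As a side note to the content size section above: the four separate passes over contentSizes (reduce, count, min, max) can also be collapsed into a single pass with Spark's built-in stats() on an RDD of doubles. A minimal sketch, not part of the original program, that could replace that block inside main:

    // one pass over the data: count, mean, min and max all come from a single StatCounter
    val sizeStats = accessLogs.map(log => log.contentSize.toDouble).stats()
    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      sizeStats.mean, sizeStats.min, sizeStats.max))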
2. Log analysis data cleaning class (ApacheAccessLog)
package com.ibeifeng.bigdata.spark.app.core

/**
 * Created by XuanYu on 2016/7/11.
 */
case class ApacheAccessLog(
  ipAddress: String,
  clientIdentd: String,
  userId: String,
  dateTime: String,
  method: String,
  endPoint: String,
  protocol: String,
  responseCode: Int,
  contentSize: Long
)

object ApacheAccessLog {

  // regex for an Apache access log line, e.g.:
  // 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
  val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  /**
   * Check whether a raw line matches the access log pattern.
   *
   * @param log a raw log line
   * @return true if the line can be parsed
   */
  def isValidateLogLine(log: String): Boolean = {
    // a line is valid if the pattern matches it
    PATTERN.findFirstMatchIn(log).nonEmpty
  }
  /**
   * Parse a raw line into an ApacheAccessLog.
   *
   * @param log a raw log line
   * @return the parsed ApacheAccessLog
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // parse the log line
    val res = PATTERN.findFirstMatchIn(log)
    // fail fast on lines that do not match
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    // extract the captured groups
    val m = res.get
    ApacheAccessLog(
      m.group(1), m.group(2), m.group(3),
      m.group(4), m.group(5), m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong)
  }
}
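For reference, a small standalone check of the parser on a line shaped like the one in the regex comment; the sample line and the ApacheAccessLogDemo object name are made up for illustration:

object ApacheAccessLogDemo {
  def main(args: Array[String]): Unit = {
    val line = """1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /index.html HTTP/1.1" 200 1234"""
    // the line matches the pattern, so parseLogLine will not throw
    println(ApacheAccessLog.isValidateLogLine(line)) // true
    val log = ApacheAccessLog.parseLogLine(line)
    println(log.ipAddress)    // 1.1.1.1
    println(log.endPoint)     // /index.html
    println(log.responseCode) // 200
    println(log.contentSize)  // 1234
  }
}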
3. Custom ordering (OrderingUtils)
package com.ibeifeng.bigdata.spark.app.core

/**
 * Created by XuanYu on 2016/7/11.
 */
object OrderingUtils {

  /**
   * Orders (key, count) pairs by the second element (the count),
   * so that RDD.top picks the pairs with the largest counts.
   */
  object SecondValueOrdering extends scala.Ordering[(String, Int)] {
    override def compare(x: (String, Int), y: (String, Int)): Int = {
      x._2.compare(y._2)
    }
  }
}
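A quick sanity check of the ordering that can be pasted into spark-shell or any Scala REPL; the sample pairs are made up for illustration:

val counts = List(("/index.html", 12), ("/about", 3), ("/api/users", 27))
// sorted ascending by the second value (the count)
println(counts.sorted(OrderingUtils.SecondValueOrdering))
// List((/about,3), (/index.html,12), (/api/users,27))
// RDD.top(n) returns the largest elements under this ordering, i.e. the highest counts first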
4. SparkSQL test case (SQLApplication)
package com.ibeifeng.bigdata.spark.app.sql

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by q on 2016/7/17.
 */
object SQLApplication {

  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("SQLApplication")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    val sqlContext = new SQLContext(sc)
    // this is used to implicitly convert an RDD to a DataFrame
    import sqlContext.implicits._

    /**
     * ==========================================================================================
     */
    // create a DataFrame from a Parquet file
    val df = sqlContext.read.load("/user/ibeifeng/sparklogexample/users.parquet")
    df.show()
    /**
     * ==========================================================================================
     */

    sc.stop()
  }
}
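Beyond df.show(), the same SQLContext can also run SQL over the DataFrame once it is registered as a temporary table. A minimal sketch for the Spark 1.x API used above; the "users" table name and the "name" column are assumptions that depend on the Parquet file's schema:

    // register the DataFrame as a temporary table so it can be queried with SQL
    df.registerTempTable("users")
    // NOTE: the column name "name" is an assumption; adjust it to the actual Parquet schema
    sqlContext.sql("SELECT name FROM users").show()
    // the DataFrame API gives the same result without SQL
    df.select("name").show()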