Apache日志处理
解析Apache日志得到有用字段
import scala.util.matching.Regex
/**
 * One parsed Apache access-log line. Every field is kept as the raw text
 * captured by [[ApacheAcessLog.Parttern]], in capture-group order.
 *
 * @param IpAdress         first whitespace-delimited token of the line
 * @param ClientId         second whitespace-delimited token
 * @param UserId           third whitespace-delimited token
 * @param ServerDate       bracketed timestamp, brackets included
 * @param Response         quoted request line (three space-separated parts), quotes excluded
 * @param ResponseCode     three-digit status code
 * @param ResponseDataSize digits, or "-" when no size was logged
 * @param Referer          quoted token without whitespace, quotes excluded
 * @param UserAgent        trailing quoted text, surrounding quotes included
 */
case class ApacheAcessLog(
  IpAdress: String,
  ClientId: String,
  UserId: String,
  ServerDate: String,
  Response: String,
  ResponseCode: String,
  ResponseDataSize: String,
  Referer: String,
  UserAgent: String
)

/** Validation and parsing helpers for raw access-log lines. */
object ApacheAcessLog {

  /** Pattern for a whole log line; its nine capture groups map onto the case-class fields. */
  val Parttern: Regex =
    """^(\S+)\s(\S+)\s(\S+)\s(\[[^\[\]]+\])\s"([A-Z]+\s\S+\s\S+)"\s(\d{3})\s(\d+|-)\s"(\S+)"\s(".+")$""".r

  /**
   * Tells whether a raw line can be parsed by [[ParseApacheLog]].
   *
   * @param line raw log line
   * @return false for lines longer than 500 characters or lines that do not
   *         match [[Parttern]]; true otherwise
   */
  def CheckLogData(line: String): Boolean =
    // The length check runs first so over-long lines never reach the regex.
    line.length <= 500 && Parttern.findFirstMatchIn(line).isDefined

  /**
   * Parses one raw log line into an [[ApacheAcessLog]].
   *
   * @param line raw log line, expected to have passed [[CheckLogData]]
   * @return the record built from the nine capture groups
   * @throws NoSuchElementException if the line does not match [[Parttern]]
   */
  def ParseApacheLog(line: String): ApacheAcessLog = {
    val matcher = Parttern
      .findFirstMatchIn(line)
      .getOrElse(
        throw new NoSuchElementException(s"Line does not match the access-log pattern: $line")
      )
    ApacheAcessLog(
      matcher.group(1),
      matcher.group(2),
      matcher.group(3),
      matcher.group(4),
      matcher.group(5),
      matcher.group(6),
      matcher.group(7),
      matcher.group(8),
      matcher.group(9)
    )
  }
}
import org.apache .spark .rdd .RDD
import org.apache .spark .sql
import org.apache .spark .{SparkConf, SparkContext}
object SparkLogAnalyze {
def main(args: Array[String]): Unit = {
// Spark configuration: application name plus the "local[4]" master.
val conf = new SparkConf().setAppName("ApacheLogAnalyze").setMaster("local[4]")
val sc = SparkContext.getOrCreate(conf)
val path = "H:\\ApacheLogALL.log"
val rdd = sc.textFile(path)
// Drop lines that fail validation, then convert each remaining line into a record.
val apacheAcessLog: RDD[ApacheAcessLog] =
  rdd
    .filter(ApacheAcessLog.CheckLogData(_))
    .map(ApacheAcessLog.ParseApacheLog(_))
// At this point each record holds the fields captured by the regular expression.
apacheAcessLog.cache()
针对上一步提取的字段做个性化的处理
// Keep only the three fields processed further: server date, referer and user agent.
val df: RDD[Array[String]] =
  apacheAcessLog.map { entry =>
    Array(entry.ServerDate, entry.Referer, entry.UserAgent)
  }
df.cache()
处理Apache日志中的时间字段:将Apache日志的时间格式(dd/MMM/yyyy:HH:mm:ss Z)转换成 yyyy-MM-dd HH:mm:ss 格式。
/**
 * Converts an Apache access-log timestamp such as `[10/Oct/2000:13:55:36 -0700]`
 * into `yyyy-MM-dd HH:mm:ss` text.
 *
 * The output formatter is created without an explicit time zone or locale,
 * so the result is rendered with the JVM defaults.
 *
 * @param time timestamp in `dd/MMM/yyyy:HH:mm:ss Z` form, with or without the
 *             surrounding square brackets
 * @return the same instant formatted as `yyyy-MM-dd HH:mm:ss`
 * @throws java.text.ParseException if the text is not a valid timestamp
 */
def ApacheTimeALZ(time: String): String = {
  import java.text.SimpleDateFormat
  import java.util.Locale

  // Text after the first '['; indexOf yields -1 when there is none, so the
  // whole input is kept instead of failing on a missing bracket.
  val afterOpen = time.substring(time.indexOf('[') + 1)
  // Cut at the next ']' when present.
  val closeAt = afterOpen.indexOf(']')
  val viewTime = if (closeAt >= 0) afterOpen.substring(0, closeAt) else afterOpen

  // Locale.US is required so the English month abbreviation ("Oct") parses.
  val apacheFormat = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.US)
  val targetFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  targetFormat.format(apacheFormat.parse(viewTime))
}
针对转换后的时间格式,将日志数据按日期和小时分类:先用正则表达式从上面得到的标准时间格式中提取日期和小时字段,再按这两个字段分组,从而实现分类。
/**
 * Extracts the date-and-hour prefix from a formatted timestamp.
 *
 * @param serverdate text expected to contain a `yyyy-M-d HH` fragment
 * @return the first matching fragment (date, one whitespace character, two-digit
 *         hour), or the placeholder "0000-00-00 00:00:00" when nothing matches
 */
def DealDate(serverdate: String): String = {
  val dayAndHour: Regex = """(\d{4}-\d{1,2}-\d{1,2})\s(\d{2})""".r
  // findFirstIn yields the whole matched text, i.e. group 0 of the first match.
  dayAndHour.findFirstIn(serverdate).getOrElse("0000-00-00 00:00:00")
}
logrdd.groupBy(line=>line.ServerDate).foreach{category=>
IoOperation.LogShow(category._1,category._2)