1. SparkCore log analysis main program (LogAnalyzer)
package com.ibeifeng.bigdata.spark.app.core

import org.apache.spark.{SparkContext, SparkConf}

/**
 * Created by XuanYu on 2016/7/11.
 */
object LogAnalyzer {

  def main(args: Array[String]) {
    // step 1: create the SparkConf object
    val conf = new SparkConf()
      .setAppName("LogAnalyzer Application")
      .setMaster("local[2]")
    // step 2: create the SparkContext object
    val sc = new SparkContext(conf)

    /** ====================================================================== */
    val logFile = "file:///D:/access_log" // "hdfs://hadoop-senior01.ibeifeng.com:8020/user/beifeng/apache.access.log"

    // 1: input
    val accessLogs = sc.textFile(logFile)
      // keep only lines that match the access log pattern
      .filter(ApacheAccessLog.isValidateLogLine)
      // parse each line into an ApacheAccessLog
      .map(line => ApacheAccessLog.parseLogLine(line))

    // cache: if an RDD is reused several times, consider caching it
    accessLogs.cache()

    println("Access Logs Count : " + accessLogs.count())
    // =====================================================================
    /**
     * The average, min, and max content size of responses returned from the server.
     */
    val contentSizes = accessLogs.map(log => log.contentSize) // RDD[Long]
    // cache contentSizes: it is scanned several times below (reduce, count, min, max)
    contentSizes.cache()

    // compute
    val avgContentSize = contentSizes.reduce(_ + _) / contentSizes.count()
    val minContentSize = contentSizes.min()
    val maxContentSize = contentSizes.max()

    // unpersist
    contentSizes.unpersist()

    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      avgContentSize, minContentSize, maxContentSize
    ))
    // =====================================================================
    /**
     * A count of the response codes returned.
     */
    val responseCodeToCount = accessLogs
      // pair each log with its response code
      .map(log => (log.responseCode, 1))
      // sum the counts per response code
      .reduceByKey(_ + _)
      // collect at most five results to the driver
      .take(5)

    println(s"""Response Code Count: ${responseCodeToCount.mkString("[", ",", "]")}""")
    // =====================================================================
    /**
     * All IP addresses that have accessed this server more than N times.
     */
    val ipAddresses = accessLogs
      // pair each log with its IP address
      .map(log => (log.ipAddress, 1))
      // count requests per IP address
      .reduceByKey(_ + _)
      // keep only IP addresses with more than 10 requests
      .filter(tuple => tuple._2 > 10)
      // keep just the IP address
      .map(tuple => tuple._1)
      // collect at most three results to the driver
      .take(3)

    println(s"""IPAddress : ${ipAddresses.mkString("[", ",", "]")}""")
    // =====================================================================
    /**
     * The top endpoints requested by count.
     */
    val topEndpoints = accessLogs
      // pair each log with its endpoint
      .map(log => (log.endPoint, 1))
      // count requests per endpoint
      .reduceByKey(_ + _)
      // top: def top(num: Int)(implicit ord: Ordering[T])
      .top(3)(OrderingUtils.SecondValueOrdering)
    /**
     * Equivalent approach without a custom Ordering:
     *   .map(tuple => (tuple._2, tuple._1))
     *   .sortByKey(false)
     *   .take(3)
     *   .map(tuple => (tuple._2, tuple._1))
     */

    println(s"""Top Endpoints : ${topEndpoints.mkString("[", ",", "]")}""")

    // unpersist
    accessLogs.unpersist()
    /** ====================================================================== */
    // step 3: stop the SparkContext
    sc.stop()
  }
}
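As a side note to the content size section above: the four separate passes over contentSizes (reduce, count, min, max) can also be collapsed into a single pass with Spark's built-in stats() on an RDD of doubles. A minimal sketch, not part of the original program, that could replace that block inside main:

    // one pass over the data: count, mean, min and max all come from a single StatCounter
    val sizeStats = accessLogs.map(log => log.contentSize.toDouble).stats()
    println("Content Size Avg: %s, Min: %s, Max: %s".format(
      sizeStats.mean, sizeStats.min, sizeStats.max))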
2. Log analysis data cleaning class (ApacheAccessLog)
package com.ibeifeng.bigdata.spark.app.core

/**
 * Created by XuanYu on 2016/7/11.
 */
case class ApacheAccessLog(
  ipAddress: String,
  clientIdentd: String,
  userId: String,
  dateTime: String,
  method: String,
  endPoint: String,
  protocol: String,
  responseCode: Int,
  contentSize: Long
)

object ApacheAccessLog {

  // regex for an Apache access log line, e.g.:
  // 1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /chapter1/java/src/main/java/com/databricks/apps/logs/LogAnalyzer.java HTTP/1.1" 200 1234
  val PATTERN = """^(\S+) (\S+) (\S+) \[([\w:/]+\s[+\-]\d{4})\] "(\S+) (\S+) (\S+)" (\d{3}) (\d+)""".r
  /**
   * Check whether a raw line matches the access log pattern.
   *
   * @param log a raw log line
   * @return true if the line can be parsed
   */
  def isValidateLogLine(log: String): Boolean = {
    // a line is valid if the pattern matches it
    PATTERN.findFirstMatchIn(log).nonEmpty
  }
  /**
   * Parse a raw line into an ApacheAccessLog.
   *
   * @param log a raw log line
   * @return the parsed ApacheAccessLog
   */
  def parseLogLine(log: String): ApacheAccessLog = {
    // parse the log line
    val res = PATTERN.findFirstMatchIn(log)
    // fail fast on lines that do not match
    if (res.isEmpty) {
      throw new RuntimeException("Cannot parse log line: " + log)
    }
    // extract the captured groups
    val m = res.get
    ApacheAccessLog(
      m.group(1), m.group(2), m.group(3),
      m.group(4), m.group(5), m.group(6),
      m.group(7),
      m.group(8).toInt,
      m.group(9).toLong)
  }
}
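For reference, a small standalone check of the parser on a line shaped like the one in the regex comment; the sample line and the ApacheAccessLogDemo object name are made up for illustration:

object ApacheAccessLogDemo {
  def main(args: Array[String]): Unit = {
    val line = """1.1.1.1 - - [21/Jul/2014:10:00:00 -0800] "GET /index.html HTTP/1.1" 200 1234"""
    // the line matches the pattern, so parseLogLine will not throw
    println(ApacheAccessLog.isValidateLogLine(line)) // true
    val log = ApacheAccessLog.parseLogLine(line)
    println(log.ipAddress)    // 1.1.1.1
    println(log.endPoint)     // /index.html
    println(log.responseCode) // 200
    println(log.contentSize)  // 1234
  }
}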
3. Custom ordering (OrderingUtils)
package com.ibeifeng.bigdata.spark.app.core

/**
 * Created by XuanYu on 2016/7/11.
 */
object OrderingUtils {

  /**
   * Orders (key, count) pairs by the second element (the count),
   * so that RDD.top picks the pairs with the largest counts.
   */
  object SecondValueOrdering extends scala.Ordering[(String, Int)] {
    override def compare(x: (String, Int), y: (String, Int)): Int = {
      x._2.compare(y._2)
    }
  }
}
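A quick sanity check of the ordering that can be pasted into spark-shell or any Scala REPL; the sample pairs are made up for illustration:

val counts = List(("/index.html", 12), ("/about", 3), ("/api/users", 27))
// sorted ascending by the second value (the count)
println(counts.sorted(OrderingUtils.SecondValueOrdering))
// List((/about,3), (/index.html,12), (/api/users,27))
// RDD.top(n) returns the largest elements under this ordering, i.e. the highest counts first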
4. SparkSQL test case (SQLApplication)
package com.ibeifeng.bigdata.spark.app.sql

import org.apache.spark.sql.SQLContext
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Created by q on 2016/7/17.
 */
object SQLApplication {

  def main(args: Array[String]) {
    val conf = new SparkConf()
      .setAppName("SQLApplication")
      .setMaster("local[2]")
    val sc = new SparkContext(conf)

    val sqlContext = new SQLContext(sc)
    // this is used to implicitly convert an RDD to a DataFrame
    import sqlContext.implicits._

    /**
     * ==========================================================================================
     */
    // create a DataFrame from a Parquet file
    val df = sqlContext.read.load("/user/ibeifeng/sparklogexample/users.parquet")
    df.show()
    /**
     * ==========================================================================================
     */

    sc.stop()
  }
}
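Beyond df.show(), the same SQLContext can also run SQL over the DataFrame once it is registered as a temporary table. A minimal sketch for the Spark 1.x API used above; the "users" table name and the "name" column are assumptions that depend on the Parquet file's schema:

    // register the DataFrame as a temporary table so it can be queried with SQL
    df.registerTempTable("users")
    // NOTE: the column name "name" is an assumption; adjust it to the actual Parquet schema
    sqlContext.sql("SELECT name FROM users").show()
    // the DataFrame API gives the same result without SQL
    df.select("name").show()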