pv统计

4 篇文章 0 订阅

数据格式:取三行样本数据

50.116.27.194 - - [18/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"
50.116.27.194 - - [17/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"
50.116.27.194 - - [17/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"

代码:

import java.text.SimpleDateFormat
import java.util.{Date, Locale}

import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable

object NgixLogPV {

  // Imported locally so this object stays self-contained with respect to the
  // file-level import list.
  import scala.util.Try

  /**
    * Page markers that make a request count as a page view.
    *
    * A request path qualifies when it contains any of these strings.
    */
  val pages: mutable.HashSet[String] = mutable.HashSet(".jsp", ".html", ".php")

  /**
    * Minimum number of space-separated fields a log line must have to be
    * considered complete (the original check rejected lines with 11 or fewer).
    */
  private val MinFieldCount = 12

  /**
    * One parsed access-log record.
    *
    * @param remote_addr client IP (field 0 of the log line)
    * @param time_local  request time without the leading '[' (field 3)
    * @param request     requested path with any query string removed (field 6)
    * @param status      HTTP status code as text (field 8)
    * @param valid       whether the record should be counted
    */
  case class KPI(
      remote_addr: String,
      time_local: String,
      request: String,
      status: String,
      var valid: Boolean = true
  ) extends Serializable

  /**
    * Parses one raw log line into a [[KPI]].
    *
    * The record is marked invalid when the line has fewer than
    * [[MinFieldCount]] fields, when the status is not a number below 400, or
    * when the timestamp cannot be parsed. Malformed lines never throw: fields
    * that are missing are returned as empty strings, so one bad line cannot
    * abort the whole job.
    *
    * @param line raw log line
    * @return the parsed record
    */
  def parser(line: String): KPI = {
    val fields = line.split(" ")
    // Bounds-checked accessor: short lines yield "" instead of throwing.
    def field(index: Int): String = if (index < fields.length) fields(index) else ""

    val remoteAddr = field(0)
    val timeLocal = field(3).drop(1) // drop the leading '['
    val status = field(8)
    val url = field(6).takeWhile(_ != '?') // strip the query string, if any

    val valid =
      fields.length >= MinFieldCount &&
        Try(status.toInt).toOption.exists(_ < 400) &&
        Try(getTime_Local_Data(timeLocal)).isSuccess

    KPI(remoteAddr, timeLocal, url, status, valid)
  }

  /**
    * Parses a line and decides whether it counts as a page view.
    *
    * A record is a valid page view only if the parser accepted it AND its
    * request path contains one of the markers in [[pages]]. The parser's
    * verdict is preserved so that error responses and malformed lines are
    * not counted.
    *
    * @param line raw log line
    * @return the parsed record with `valid` set accordingly
    */
  def filterPVs(line: String): KPI = {
    val kpi = parser(line)
    val isPage = pages.exists(page => kpi.request.contains(page))
    kpi.copy(valid = kpi.valid && isPage)
  }

  /**
    * Converts a log timestamp such as `18/Sep/2013:07:11:29` into a `Date`.
    *
    * A new formatter is created per call because `SimpleDateFormat` is not
    * thread-safe.
    *
    * @param time_local timestamp text in `dd/MMM/yyyy:HH:mm:ss` form
    * @return the parsed date
    * @throws java.text.ParseException if the text does not match the pattern
    */
  def getTime_Local_Data(time_local: String): Date = {
    val df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", Locale.US)
    df.parse(time_local)
  }

  /**
    * Formats a log timestamp as a `yyyy-MM-dd` day string.
    *
    * @param time_local timestamp text in `dd/MMM/yyyy:HH:mm:ss` form
    * @return the day in `yyyy-MM-dd` form
    * @throws java.text.ParseException if the text does not match the pattern
    */
  def getTime_local_day(time_local: String): String = {
    // Explicit locale keeps the output independent of the JVM default locale.
    val df = new SimpleDateFormat("yyyy-MM-dd", Locale.US)
    df.format(getTime_Local_Data(time_local))
  }

  /**
    * Counts page views per (day, url), sorts by day ascending then count
    * descending, and writes tab-separated `day, url, count` lines.
    *
    * NOTE(review): the master URL and the input/output paths are hard-coded;
    * confirm they are reachable from the cluster before running.
    */
  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("pv").setMaster("spark://hadoop:7077")
    val sc = new SparkContext(conf)

    val a = "a"

    try {
      val counts = sc.textFile("C:\\Users\\rf\\Desktop\\测试文件\\nlog.txt")
        .map(filterPVs)      // parse and classify every line
        .filter(_.valid)     // keep page views only
        .map(kpi => ((getTime_local_day(kpi.time_local), kpi.request), 1)) // key = (day, url)
        .reduceByKey(_ + _)

      // Day ascending, count descending.
      val sorted = counts.sortBy { case ((day, _), count) => PVSort(day, count) }

      sorted
        .map { case ((day, url), count) => s"$day\t$url\t$count" }
        .saveAsTextFile("C:\\Users\\rf\\Desktop\\测试文件\\result\\nlogPV4")
    } finally {
      // Always release the context, even when the job fails.
      sc.stop()
    }
    println(a + "--------------------------")

  }

  /**
    * Sort key: day ascending, then hit count descending.
    *
    * @param date  day string
    * @param count hit count
    */
  case class PVSort(date: String, count: Int) extends Ordered[PVSort] with Serializable {
    override def compare(that: PVSort): Int = {
      val byDate = this.date.compareTo(that.date)
      // Operands are swapped for the count so that larger counts sort first.
      if (byDate != 0) byDate else java.lang.Integer.compare(that.count, this.count)
    }
  }
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值