数据格式:取三行样本数据
50.116.27.194 - - [18/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"
50.116.27.194 - - [17/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"
50.116.27.194 - - [17/Sep/2013:07:11:29 +0000] "POST /wp-cron.php?doing_wp_cron=1379488288.8893849849700927734375 HTTP/1.0" 200 0 "-" "WordPress/3.6; http://itunic.com"
代码:
import java.text.SimpleDateFormat
import java.util.{Date, Locale}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
object NgixLogPV{
/**
 * Page suffixes that count as a page view (PV).
 * Kept as a mutable.HashSet to preserve the original public type.
 */
val pages = mutable.HashSet(".jsp", ".html", ".php")
/**
 * One parsed access-log record.
 * @param remote_addr client IP address
 * @param time_local  access time (nginx local-time token, '[' stripped)
 * @param request     requested URL, query string removed
 * @param status      HTTP status code as written in the log
 * @param valid       whether this record should be counted
 */
case class KPI(
remote_addr:String,
time_local:String,
request:String,
status:String,
var valid:Boolean = true
) extends Serializable
/**
 * Parse one raw log line into a KPI.
 *
 * A complete combined-format line splits into more than 11 space-separated
 * tokens; shorter lines are returned as invalid instead of throwing
 * ArrayIndexOutOfBoundsException (the original indexed the array before
 * checking its length).
 *
 * @param line one raw log line
 * @return parsed KPI; `valid` is false for short lines, non-numeric status
 *         codes, and status codes >= 400
 */
def parser(line:String) : KPI ={
val fields = line.split(" ")
// Guard BEFORE indexing: fields(3)/(6)/(8) only exist on well-formed lines.
if(fields.length<=11){
KPI(if(fields.nonEmpty) fields(0) else "", "", "", "", valid = false)
}else{
val remote_addr = fields(0)
val time_local = fields(3).substring(1)   // drop the leading '['
val request = fields(6)
val status = fields(8)
// Non-numeric status (corrupt line) or an error status invalidates the hit.
val valid = status.nonEmpty && status.forall(_.isDigit) && status.toInt < 400
// Strip the query string so identical pages aggregate under one key.
val q = request.indexOf("?")
val url = if(q != -1) request.substring(0, q) else request
KPI(remote_addr,time_local,url,status,valid)
}
}
/**
 * Parse a line and keep it only if it is both structurally valid AND its
 * URL carries one of the tracked page suffixes.
 *
 * Fix: the original reset `valid` and then granted it purely on suffix
 * match, which counted 4xx/5xx responses as page views; the parser's
 * verdict is now honoured as well.
 *
 * @param line one raw log line
 * @return KPI whose `valid` flag reflects both checks
 */
def filterPVs(line:String):KPI ={
val kpi = parser(line)
kpi.valid = kpi.valid && kpi.request != null && pages.exists(kpi.request.contains)
kpi
}
/**
 * Convert the nginx log timestamp (e.g. "18/Sep/2013:07:11:29") to a Date.
 * Locale.US is required so the English month abbreviation parses on any JVM.
 *
 * @param time_local nginx time token
 * @return parsed java.util.Date
 */
def getTime_Local_Data(time_local:String):Date={
val df = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss",Locale.US)
df.parse(time_local)
}
/**
 * Format a nginx timestamp down to its calendar day ("yyyy-MM-dd").
 * A fresh SimpleDateFormat per call — the class is not thread-safe and this
 * runs inside Spark tasks.
 *
 * @param time_local nginx time token
 * @return day string, e.g. "2013-09-18"
 */
def getTime_local_day(time_local:String):String = {
val df = new SimpleDateFormat("yyyy-MM-dd")
df.format(getTime_Local_Data(time_local))
}
/**
 * Entry point: count PVs per (day, url), sort by day ascending then count
 * descending, and write the result as tab-separated text.
 */
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("pv").setMaster("spark://hadoop:7077")
val sc = new SparkContext(conf)
val rdd = sc.textFile("C:\\Users\\rf\\Desktop\\测试文件\\nlog.txt")
// Parse + validate each line, drop invalid records, key by (day, url).
.map(filterPVs)
.filter(_.valid)
.map(x => ((getTime_local_day(x.time_local), x.request), 1))
.reduceByKey(_+_)
// Sort with the custom ordering: day ascending, hit count descending.
val rdd2 = rdd.sortBy(x =>PVSort(x._1._1,x._2))
rdd2.map(x => x._1._1+"\t"+x._1._2+"\t"+x._2)
.saveAsTextFile("C:\\Users\\rf\\Desktop\\测试文件\\result\\nlogPV4")
sc.stop()
// Preserved diagnostic marker from the original program.
println("a"+"--------------------------")
}
/**
 * Custom ordering: date ascending, then hit count descending.
 * @param date  day string ("yyyy-MM-dd")
 * @param count number of hits
 */
case class PVSort(date:String,count:Int) extends Ordered[PVSort] with Serializable{
override def compare(that: PVSort): Int = {
val byDate = this.date.compareTo(that.date)
// Same day: negate so larger counts sort first.
if(byDate == 0) -this.count.compareTo(that.count) else byDate
}
}
}