// File: ApacheAccessLog.scala
package lesson02

// One parsed record of an Apache access log line.
// A case class is required here: the toDF() call below (via sqlContext.implicits)
// only works on RDDs of Product types, so a plain class would not compile.
case class ApacheAccessLog(
  ipAddress: String,     // client IP address
  clientIndextd: String, // identd client identifier
  userID: String,        // user id
  dateTime: String,      // request timestamp
  method: String,        // HTTP request method
  endPoint: String,      // requested endpoint (target path)
  responseCode: Int,     // HTTP response status code
  contentSize: Long      // response content size in bytes
)
object ApacheAccessLog {
  // Parses one "#"-delimited log line of the form:
  //   ip#identd#userId#dateTime#method/endPoint#responseCode#contentSize
  def parseLog(log: String): ApacheAccessLog = {
    val logArray = log.split("#")
    // Split the request field into method and endpoint; the limit of 2 keeps
    // any further "/" characters inside the endpoint itself.
    val url = logArray(4).split("/", 2)
    ApacheAccessLog(logArray(0), logArray(1), logArray(2), logArray(3),
      url(0), url(1), logArray(5).toInt, logArray(6).toLong)
  }
}
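A quick sanity check of the parser. The sample line below is hypothetical (the post never shows the actual log file), but it matches the "#"-delimited layout that parseLog expects:

val sample = "127.0.0.1#-#user01#2016-01-01 10:00:00#GET/index.html#200#1024"
val record = ApacheAccessLog.parseLog(sample)
println(record.method)   // GET
println(record.endPoint) // index.html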
// File: ApacheLogBySql.scala
package lesson02

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.{SparkConf, SparkContext}
object ApacheLogBySql {
def main(args: Array[String]): Unit = {
val conf = new SparkConf().setAppName("sql").setMaster("local")
val sc = new SparkContext(conf)
val sqlContext = new SQLContext(sc)
val logFile = sc.textFile("D:\\log.txt")
    val apacheLog = logFile.map(line => ApacheAccessLog.parseLog(line))
    import sqlContext.implicits._
    val df: DataFrame = apacheLog.toDF()
    df.registerTempTable("log")
    // Spark SQL tuning: cache the registered table in memory, since every
    // query below scans the same data.
    sqlContext.cacheTable("log")
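    // Note: cacheTable is lazy; the table is only materialized in memory on
    // the first action that reads it. Call sqlContext.uncacheTable("log")
    // afterwards to release the cache.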
    /**
     * Requirement 1: the average, min, and max content size of responses
     * returned from the server.
     */
    val sql1 =
      """
        select avg(contentSize), min(contentSize), max(contentSize) from log
      """
    // sql() only builds a DataFrame; show() triggers execution and prints it.
    sqlContext.sql(sql1).show()
    /**
     * Requirement 2: a count of each response code returned.
     */
    val sql2 =
      """
        select
          responseCode, count(*)
        from
          log
        group by
          responseCode
      """
    sqlContext.sql(sql2).show()
    /**
     * Requirement 3: all IP addresses that have accessed this server more
     * than N times (N = 1 in this query).
     */
    val sql3 =
      """
        select
          ipAddress, count(*) as total
        from
          log
        group by
          ipAddress
        having
          total > 1
      """
    sqlContext.sql(sql3).show()
    /**
     * Requirement 4: the top endpoints requested by count (top 2 here).
     */
    val sql4 =
      """
        select
          endPoint, count(*) as total
        from
          log
        group by
          endPoint
        order by
          total desc
        limit 2
      """
    sqlContext.sql(sql4).show()
    sc.stop()
  }
}
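For comparison, the top-endpoints query can also be written against the DataFrame API instead of an SQL string. This is a minimal sketch under the same Spark 1.x setup; topEndpoints is a hypothetical helper, not part of the original code:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.desc

// Same computation as sql4: group by endpoint, count, sort, keep the top 2.
// (Hypothetical helper; pass in the df built in main above.)
def topEndpoints(df: DataFrame): Unit = {
  df.groupBy("endPoint")
    .count()                // yields columns (endPoint, count)
    .orderBy(desc("count"))
    .limit(2)
    .show()
}

Unlike the SQL string, the method names here are checked at compile time, though column names are still resolved at runtime.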