1. 案例一: 通过Spark实现点击流日志分析
前言:pv, uv概念
- **PV**: 重视每一次的访问, 一个用户访问一次就记录一次. 访问一个页面算作一次PV
- **UV**: 重视每一次会话的情况, 30分钟内如果同一个session一直访问不同页面, 则只记录一个UV. UV即一天之内访问网站不重复的用户数; 不重复的用户越多, 说明网站访问的人数越多. 使用cookie来区分不同的用户.
1.1 Spark统计PV
package cn.SparkPV
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf,SparkContext}

/** Counts total page views (PV): every line of the access log is one view. */
object PV{
def main(args:Array[String]): Unit = {
  // Local-mode Spark with 2 worker threads.
  val conf: SparkConf = new SparkConf().setAppName("PV").setMaster("local[2]")
  val context: SparkContext = new SparkContext(conf)

  // Each log line contributes exactly one ("pv", 1) pair.
  val logLines: RDD[String] = context.textFile("d:\\data\\access.log")
  val pvPairs: RDD[(String, Int)] = logLines.map(line => ("pv", 1))

  // Summing the 1s under the single "pv" key yields the total page-view count.
  val pvTotal: RDD[(String, Int)] = pvPairs.reduceByKey((a, b) => a + b)
  pvTotal.foreach(println)

  context.stop()
}
}
1.2 Spark统计UV
package cn.sparkUV
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/** Counts unique visitors (UV): distinct client IPs in the access log.
  * NOTE(review): this uses IP as the user identifier; the prose above mentions
  * cookies/sessions — confirm which identifier is intended for production.
  */
object UV{
def main(args: Array[String]): Unit = {
  val sparkConf: SparkConf = new SparkConf().setAppName("UV").setMaster("local[2]")
  val sc: SparkContext = new SparkContext(sparkConf)

  val file: RDD[String] = sc.textFile("d:\\data\\access.log")
  // The first space-separated field of each log line is the client IP.
  val ips: RDD[(String)] = file.map(_.split(" ")).map(x => x(0))

  // Deduplicate IPs, then count them under a single "UV" key.
  val uvAndOne: RDD[(String, Int)] = ips.distinct().map(x => ("UV", 1))
  // FIX: cache the result — it is consumed twice below (foreach + save);
  // without caching the whole lineage would be recomputed for each action.
  val totalUV: RDD[(String, Int)] = uvAndOne.reduceByKey(_+_).cache()

  totalUV.foreach(println)
  totalUV.saveAsTextFile("d:\\data\\out")

  sc.stop()
}
}
1.3 Spark统计访问的topN
package cn.sparkTopN
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

/** Prints the top-5 most frequent referrer URLs from the access log. */
object TopN{
def main(args: Array[String]): Unit = {
  val sparkConf: SparkConf = new SparkConf().setAppName("TopN").setMaster("local[2]")
  // FIX: the original never constructed the SparkContext, so `sc` was undefined.
  val sc: SparkContext = new SparkContext(sparkConf)
  sc.setLogLevel("WARN")

  val file: RDD[String] = sc.textFile("d:\\data\\access.log")
  // Field at index 10 of a combined-format log line is the referrer URL;
  // skip malformed lines that have too few fields.
  val refUrlAndOne: RDD[(String,Int)] = file.map(_.split(" ")).filter(_.length>10).map(x=>(x(10),1))

  // FIX: `sortBy(_._2. false)` was a syntax error (period instead of comma);
  // sort by count in descending order.
  val result: RDD[(String,Int)] = refUrlAndOne.reduceByKey(_+_).sortBy(_._2, ascending = false)
  val finalResult: Array[(String,Int)] = result.take(5)
  println(finalResult.toBuffer)

  sc.stop()
}
}