Computing PV, UV, and the second-hop rate with Spark SQL

UV: unique visitors, count(distinct guid)

PV: page views, count(url)

Second-hop rate: count(distinct case when pv>=2 then sessionid else null end) / count(distinct sessionid), i.e. the share of sessions that viewed at least two pages. For example, if 4 of 10 sessions viewed two or more pages, the second-hop rate is 40%.
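For intuition, here is a self-contained toy run of the same three metrics (a sketch: the sample rows and the MetricsToy name are made up for illustration):

import org.apache.spark.sql.SparkSession

object MetricsToy {
  def main(args: Array[String]): Unit = {
    val ss = SparkSession.builder().appName("metricsToy").master("local").getOrCreate()
    import ss.implicits._
    // Toy sessions: g1 has two sessions; only s2 viewed two pages, i.e. it "second-hops".
    Seq(
      ("g1", "s1", "/index"),
      ("g1", "s2", "/index"),
      ("g1", "s2", "/detail"),
      ("g2", "s3", "/index")
    ).toDF("guid", "sessionid", "url").createOrReplaceTempView("toy")
    // Expected: uv = 2, pv = 4, second_rate = 1/3 (only s2 viewed >= 2 pages)
    ss.sql(
      """
        |select count(distinct guid) uv, sum(pv) pv,
        |count(case when pv>=2 then sessionid else null end) / count(sessionid) second_rate
        |from (select sessionid, max(guid) guid, count(url) pv
        |from toy group by sessionid) t
      """.stripMargin).show()
    ss.stop()
  }
}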

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel

/**
  * Created by zengxiaosen on 16/9/20.
  */
object visit {

  def main(args: Array[String]): Unit = {
    val sparkconf = new SparkConf().setAppName("visitCount").setMaster("local")
    val ss = SparkSession.builder().config(sparkconf).getOrCreate()
    val sc = ss.sparkContext
    import ss.implicits._

    // The raw log is tab-separated; field 17 holds the timestamp (its first 10
    // characters are the date), field 5 the visitor id, field 10 the session id,
    // and field 1 the url.
    val logDF = sc.textFile("/opt/tarballs/spark_kafka/beifengspark/src/main/scala/2015082818")
      .filter(line => line.length > 0)
      .map{ line =>
        val arr = line.split("\t")
        val date = arr(17).substring(0, 10)
        val guid = arr(5)
        val sessionid = arr(10)
        val url = arr(1)
        (date, guid, sessionid, url)
      }
      .filter(i => i._4.length > 0) // drop records with an empty url
      .toDF("date", "guid", "sessionid", "url")
      .persist(StorageLevel.DISK_ONLY) // reused by the queries below

    logDF.createOrReplaceTempView("log")
    /*
    guid identifies a unique visitor; it is coarser-grained than sessionid,
    since one visitor can open many sessions.
     */
    // Roll up to one row per session first; since each row of subquery a is a
    // single session, the outer counts need no distinct on sessionid, and
    // max(guid) simply picks the session's visitor id.
    val sql =
      s"""
         |select date,count(distinct guid) uv,sum(pv) pv,
         |count(case when pv>=2 then sessionid else null end) second_num,
         |count(sessionid) visits from
         |(select date, sessionid, max(guid) guid, count(url) pv from log
         |group by date,sessionid) a
         |group by date
       """.stripMargin

    // A simpler variant kept for reference (not executed below): per-date uv
    // and raw pv, with no session rollup and hence no second-hop numbers.
    val sql01 =
      s"""
         |select date,count(distinct guid) uv, count(url) pv from log
         |group by date
       """.stripMargin

    val result = ss.sql(sql).cache()
    result.show()
    result.printSchema()
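    // The query yields second_num and visits per date; the second-hop rate is
    // their ratio (a small extra step, not in the original program):
    result.selectExpr("date", "second_num / visits as second_rate").show()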

    ss.stop() // stopping the session also stops its SparkContext
  }

}
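Run against the real log (the hard-coded path above is local to the author's machine), result.show() prints one row per date with the columns date, uv, pv, second_num and visits; dividing second_num by visits in each row gives that day's second-hop rate.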
