1. Requirement 1: Top 10 hot categories
2. Requirement 2: Top 10 active sessions for each of the Top 10 hot categories
3. Requirement 3: page single-jump conversion rate
Spark Core E-Commerce Analysis
Code for Requirement 1 (the listing first takes the Top 10 categories by click count, then also produces the per-category Top 10 session counts of Requirement 2). Each line of user_visit_action.txt is one user action with 13 "_"-separated fields.
package demo
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.Test
class demo电商分析 {

  @Test
  def methods_1(): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("pro_1")
    val sc = new SparkContext(conf)

    // each line of the log is one user action; 13 fields separated by "_"
    val rdd: RDD[String] = sc.textFile("user_visit_action.txt", 2)
    val rdd1: RDD[Array[String]] = rdd.map(f => f.split("_"))
    val value = rdd1.map(f => (f(0), f(1), f(2), f(3), f(4), f(5), f(6),
      f(7), f(8), f(9), f(10), f(11), f(12)))
    // the RDD is reused below for the session statistics, so cache it
    value.cache()

    // Requirement 1: field 7 is click_category_id (-1 means the action is not
    // a click); count clicks per category and keep the 10 most-clicked ones
    val tuples: Array[(String, Int)] = value.filter(f => f._7.toInt != -1)
      .map(f => f._7)
      .groupBy(f => f)
      .map(f => (f._1, f._2.size))
      .sortBy(f => f._2, false)
      .take(10)
    println("-------------------------------------------------------------")

    // Requirement 2: broadcast the Top 10 category ids, count clicks per
    // (category, session) pair, and keep the 10 most active sessions of each
    // category (field 3 is session_id)
    val bc: Broadcast[Array[String]] = sc.broadcast(tuples.map(f => f._1))
    value.filter(f => f._7.toInt != -1 && bc.value.contains(f._7))
      .map(f => ((f._7, f._3), 1))
      .reduceByKey((v1, v2) => v1 + v2)
      .map(f => (f._1._1, (f._1._2, f._2)))
      .groupByKey()
      // sort by click count descending so the most active sessions come first
      .mapValues(f => f.toList.sortBy(f => -f._2).take(10))
      .foreach(f => println("category id: " + f._1 + ", sessions: " + f._2))

    sc.stop()
  }
}
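One note on the click-count stage above: groupBy shuffles every individual click record before counting, while reduceByKey combines counts on the map side first. A minimal drop-in sketch for the same Top 10 computation, reusing the cached `value` RDD from the listing above:

// same Top 10 categories by click count, but aggregated map-side with
// reduceByKey instead of shipping whole groups across the shuffle
val top10ByClick: Array[(String, Int)] = value
  .filter(f => f._7.toInt != -1)
  .map(f => (f._7, 1))
  .reduceByKey(_ + _)
  .sortBy(f => f._2, false)
  .take(10)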
Code for Requirement 2 (this listing ranks the categories by click, order, and pay counts and keeps the Top 10)
UserVisitAction
package homework.homework4
// one parsed line of user_visit_action.txt
case class UserVisitAction(
  var date: String,
  var user_id: Long,
  var session_id: String,
  var page_id: Long,
  var action_time: String,
  var search_keyword: String,
  var click_category_id: Long,
  var click_product_id: Long,
  var order_category_ids: String,
  var order_product_ids: String,
  var pay_category_ids: String,
  var pay_product_ids: String,
  var city_id: Long)

// click / order / pay counters for one category; var fields let the
// reduce in the job below accumulate in place
case class totalcount(
  var id: String,
  var clickcount: Long,
  var ordercount: Long,
  var paycount: Long)
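totalcount keeps its fields as vars so the reduce in the job below can add into info1 and return it without allocating. If an immutable record is preferred, a sketch along these lines (CategoryCount and merge are hypothetical names, not part of the original code) drops in with `(a, b) => a.merge(b)`:

// hypothetical immutable alternative to totalcount: merging returns a new
// instance instead of mutating the left operand
case class CategoryCount(id: String, clicks: Long, orders: Long, pays: Long) {
  def merge(other: CategoryCount): CategoryCount =
    CategoryCount(id, clicks + other.clicks, orders + other.orders, pays + other.pays)
}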
package homework.homework4
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
import org.junit.Test
import scala.collection.mutable.ListBuffer
class 电商分析2 {

  @Test
  def mainMethod(): Unit = {
    val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("pro_1")
    val sc = new SparkContext(conf)

    val rdd: RDD[String] = sc.textFile("user_visit_action.txt", 2)

    // parse every log line into a UserVisitAction
    val value: RDD[UserVisitAction] = rdd.map(data => {
      val f: Array[String] = data.split("_")
      UserVisitAction(
        f(0),
        f(1).toLong,
        f(2),
        f(3).toLong,
        f(4),
        f(5),
        f(6).toLong,
        f(7).toLong,
        f(8),
        f(9),
        f(10),
        f(11),
        f(12).toLong)
    })
    // Expand every action into per-category counters. split("_") never yields
    // null elements: missing fields arrive as the literal string "null", so
    // that is what the order/pay branches must test against.
    val value1: RDD[totalcount] = value.flatMap(
      f => {
        if (f.click_category_id != -1) {
          // a click touches exactly one category
          List(totalcount(f.click_category_id.toString, 1, 0, 0))
        } else if (f.order_category_ids != "null") {
          // an order may cover several categories, separated by ","
          val totalcounts: ListBuffer[totalcount] = new ListBuffer[totalcount]
          for (elem <- f.order_category_ids.split(",")) {
            totalcounts.append(totalcount(elem, 0, 1, 0))
          }
          totalcounts
        } else if (f.pay_category_ids != "null") {
          // likewise for payments
          val totalcounts: ListBuffer[totalcount] = new ListBuffer[totalcount]
          for (elem <- f.pay_category_ids.split(",")) {
            totalcounts.append(totalcount(elem, 0, 0, 1))
          }
          totalcounts
        } else {
          Nil
        }
      }
    )
    // sum the counters per category, then rank descending by
    // (clicks, orders, pays) and keep the Top 10
    val value2: RDD[(String, Iterable[totalcount])] = value1.groupBy(f => f.id)
    value2
      .mapValues(f => {
        f.reduce {
          (info1, info2) => {
            info1.clickcount = info1.clickcount + info2.clickcount
            info1.ordercount = info1.ordercount + info2.ordercount
            info1.paycount = info1.paycount + info2.paycount
            info1
          }
        }
      })
      .map(f => f._2)
      .sortBy(f => (f.clickcount, f.ordercount, f.paycount), false)
      .take(10)
      .foreach(f => println(f))

    sc.stop()
  }
}
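Requirement 3 (page single-jump conversion rate) is listed at the top but has no code in this section. A minimal sketch, reusing the UserVisitAction parsing above and assuming the rate of a jump A -> B is defined as count(A -> B jumps) / count(visits to page A), with jumps taken from adjacent actions of the same session ordered by action_time:

// sketch for Requirement 3: page single-jump conversion rate
// assumes value: RDD[UserVisitAction], parsed exactly as in the listing above

// denominator: how many times each page was visited
val pageVisits: scala.collection.Map[Long, Long] =
  value.map(a => (a.page_id, 1L)).reduceByKey(_ + _).collectAsMap()

// numerator: order each session's actions by time and pair up adjacent
// pages into (from, to) single jumps
val jumpCounts = value
  .groupBy(a => a.session_id)
  .flatMap { case (_, actions) =>
    val pages = actions.toList.sortBy(a => a.action_time).map(a => a.page_id)
    pages.zip(pages.tail).map(jump => (jump, 1L))
  }
  .reduceByKey(_ + _)

// conversion rate of A -> B = jumps(A -> B) / visits(A)
jumpCounts.collect().foreach { case ((from, to), cnt) =>
  println(s"$from -> $to : ${cnt.toDouble / pageVisits(from)}")
}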