代码如下:
```scala
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Batch analysis over e-commerce click-stream data using Spark RDDs.
 *
 * Reads `user_visit_action.txt` (tab-separated) and `city_info.txt` from the
 * working directory and prints four reports to standard output:
 * top categories, page jump rates, top products per city, and record counts
 * per hour.
 *
 * NOTE(review): column 6 of the action file is interpreted three different ways
 * below (category id, action type, product id). This mirrors the original
 * logic and must be confirmed against the real file schema.
 */
object ECommerceAnalysis {
  import org.apache.spark.rdd.RDD

  /**
   * Entry point: builds a local SparkContext, runs the four reports and always
   * stops the context, even when one of the jobs fails.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("ECommerceAnalysis").setMaster("local[*]")
    val sc = new SparkContext(conf)
    try {
      val cityInfo = sc.textFile("city_info.txt")
      // Split every action row once and cache it: four reports reuse this RDD.
      // Blank lines are skipped because they cannot be indexed by column.
      val actions = sc.textFile("user_visit_action.txt")
        .filter(line => line.trim.nonEmpty)
        .map(line => line.split("\t"))
        .cache()

      printTopCategories(actions)
      printPageJumpRates(actions)
      printCityTopProducts(actions, cityInfo)
      printHourlyVisits(actions)
    } finally {
      sc.stop()
    }
  }

  /**
   * Report 1: prints the 10 most popular categories, ranked in descending order
   * by click count, then order count, then payment count.
   *
   * A row contributes a click when column 6 is not "-1"; otherwise an order for
   * each comma-separated id in column 8 when that is not "null"; otherwise a
   * payment for each comma-separated id in column 10 when that is not "null".
   *
   * @param actions action rows already split into columns
   */
  private def printTopCategories(actions: RDD[Array[String]]): Unit = {
    val topCategories = actions
      .flatMap { fields =>
        if (fields(6) != "-1") {
          List((fields(6), (1, 0, 0)))
        } else if (fields(8) != "null") {
          fields(8).split(",").toList.map(id => (id, (0, 1, 0)))
        } else if (fields(10) != "null") {
          fields(10).split(",").toList.map(id => (id, (0, 0, 1)))
        } else {
          Nil
        }
      }
      .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2, a._3 + b._3))
      .map { case (categoryId, (clickCount, orderCount, paymentCount)) =>
        (clickCount, orderCount, paymentCount, categoryId)
      }
      .sortBy(row => (row._1, row._2, row._3), ascending = false)
      .take(10)

    // take() already returned a local array, so this prints on the driver.
    topCategories.foreach { case (clickCount, orderCount, paymentCount, categoryId) =>
      println(s"Category $categoryId: ClickCount: $clickCount, OrderCount: $orderCount, PaymentCount: $paymentCount")
    }
  }

  /**
   * Report 2: prints, for every source page, up to 10 target pages with the
   * highest jump rate.
   *
   * Jump count: within a session (column 2), rows are ordered by the action-time
   * string (column 4); each adjacent pair whose first row has action type 0 and
   * whose second has action type 1 counts as one jump between their page ids
   * (column 5). Visit count: the distinct page ids of a session are sorted
   * numerically and each adjacent pair is counted once per session. The rate is
   * jump count divided by visit count for pairs present in both.
   *
   * NOTE(review): the action type is parsed from column 6, the same column the
   * category report treats as a click category id. Confirm the intended column.
   *
   * @param actions action rows already split into columns
   */
  private def printPageJumpRates(actions: RDD[Array[String]]): Unit = {
    val jumpCounts = actions
      .map { fields =>
        val pageId = fields(5).toInt
        val actionTime = fields(4)
        val actionType = fields(6).toInt
        (fields(2), (pageId, actionTime, actionType))
      }
      .groupByKey()
      .flatMap { case (_, sessionActions) =>
        val ordered = sessionActions.toList.sortBy(_._2)
        // drop(1) is total, so a single-action session simply yields no pairs.
        ordered.zip(ordered.drop(1)).collect {
          case ((fromPage, _, 0), (toPage, _, 1)) => ((fromPage, toPage), 1)
        }
      }
      .reduceByKey(_ + _)

    val visitCounts = actions
      .map(fields => (fields(2), fields(5).toInt))
      .distinct()
      .groupByKey()
      .flatMap { case (_, pages) =>
        val sortedPages = pages.toList.sorted
        sortedPages.zip(sortedPages.drop(1)).map(pair => (pair, 1))
      }
      .reduceByKey(_ + _)

    // Bring the (small) per-page result to the driver and sort it so the
    // printed order is deterministic instead of depending on task scheduling.
    val rates = jumpCounts
      .join(visitCounts)
      .map { case ((fromPage, toPage), (jumps, visits)) =>
        (fromPage, (toPage, jumps.toDouble / visits))
      }
      .groupByKey()
      .mapValues(targets => targets.toList.sortBy(-_._2).take(10))
      .collect()
      .sortBy(_._1)

    rates.foreach { case (pageId, pageJumpInfo) =>
      val rendered = pageJumpInfo.mkString(", ")
      println(s"Page $pageId: $rendered")
    }
  }

  /**
   * Report 3: prints the three most frequent product ids for every city.
   *
   * Rows whose column 6 is "-1" are ignored; the remaining rows are counted per
   * (column 3, column 6) pair. City ids are translated to names through
   * `city_info.txt`, falling back to "Unknown" for ids that are not listed.
   *
   * NOTE(review): city rows are split on a single space while action rows are
   * split on a tab. Confirm the real delimiter of city_info.txt.
   *
   * @param actions  action rows already split into columns
   * @param cityInfo raw lines of the city file (id in column 0, name in column 1)
   */
  private def printCityTopProducts(actions: RDD[Array[String]], cityInfo: RDD[String]): Unit = {
    val cityNames = cityInfo
      .map { line =>
        val fields = line.split(" ")
        (fields(0), fields(1))
      }
      .collectAsMap()

    val topByCity = actions
      .filter(fields => fields(6) != "-1")
      .map(fields => ((fields(3), fields(6)), 1))
      .reduceByKey(_ + _)
      .map { case ((cityId, productId), count) => (cityId, (productId, count)) }
      .groupByKey()
      .mapValues(products => products.toList.sortBy(-_._2).take(3))
      .collect()
      .sortBy(_._1)

    // The name lookup happens on the driver, so the map is never shipped to tasks.
    topByCity.foreach { case (cityId, top3Product) =>
      val cityName = cityNames.getOrElse(cityId, "Unknown")
      val top3ProductStr = top3Product
        .map { case (productId, count) => s"$productId:${count}" }
        .mkString(", ")
      println(s"$cityName: $top3ProductStr")
    }
  }

  /**
   * Report 4 (custom): prints the number of action rows per hour, in ascending
   * hour order.
   *
   * The hour is taken as characters 11-12 of column 4; this presumably assumes a
   * "yyyy-MM-dd HH:mm:ss" timestamp. TODO confirm. Note that rows are counted,
   * not distinct users or sessions.
   *
   * @param actions action rows already split into columns
   */
  private def printHourlyVisits(actions: RDD[Array[String]]): Unit = {
    actions
      .map(fields => (fields(4).substring(11, 13), 1))
      .reduceByKey(_ + _)
      .sortByKey()
      .collect() // keep the sorted order: print on the driver, not inside tasks
      .foreach { case (hour, count) =>
        println(s"Hour $hour: $count")
      }
  }
}
```