准备
需求1:Top10热门品类
需求说明:品类是指产品的分类,大型电商网站品类分多级,咱们的项目中品类只有一级,不同的公司可能对热门的定义不一样。我们按照每个品类的点击、下单、支付的量来统计热门品类。
鞋 点击数 下单数 支付数
衣服 点击数 下单数 支付数
电脑 点击数 下单数 支付数
例如,综合排名 = 点击数*20% + 下单数*30% + 支付数*50%
本项目需求优化为:先按照点击数排名,靠前的就排名高;如果点击数相同,再比较下单数;下单数再相同,就比较支付数。
6.2.1 需求分析(方案一)分步计算
思路:分别统计每个品类点击的次数,下单的次数和支付的次数。
(品类,点击总数)(品类,下单总数)(品类,支付总数)
缺点:统计3次,需要启动3个job,每个job都要对原始数据遍历一次,效率低。
6.2.2 需求分析(方案二)常规算子
6.2.3 需求分析(方案三)样例类
1)用来封装用户行为的样例类
//用户访问动作表
package com.atguigu.project01;
import java.io.Serializable;
/**
 * Serializable bean that carries one record of the user visit action table.
 *
 * <p>Every field is a plain mutable property exposed through a getter/setter
 * pair; the class performs no validation and no conversion of its own. The
 * snake_case property names are kept as-is because they are part of the
 * public accessor names.
 */
public class UserVisitAction implements Serializable {

    // ---- who / when / where ----
    private String date;
    private Long user_id;
    private String session_id;
    private Long page_id;
    private String action_time;

    // ---- search ----
    private String search_keyword;

    // ---- click ----
    private Long click_category_id;
    private Long click_product_id;

    // ---- order (stored as raw strings, exactly as supplied) ----
    private String order_category_ids;
    private String order_product_ids;

    // ---- pay (stored as raw strings, exactly as supplied) ----
    private String pay_category_ids;
    private String pay_product_ids;

    private Long city_id;

    /** Creates an empty record; every property starts out as {@code null}. */
    public UserVisitAction() {
    }

    /**
     * Creates a fully populated record. Each argument is stored unchanged in
     * the property of the same name.
     */
    public UserVisitAction(
            String date,
            Long user_id,
            String session_id,
            Long page_id,
            String action_time,
            String search_keyword,
            Long click_category_id,
            Long click_product_id,
            String order_category_ids,
            String order_product_ids,
            String pay_category_ids,
            String pay_product_ids,
            Long city_id) {
        this.date = date;
        this.user_id = user_id;
        this.session_id = session_id;
        this.page_id = page_id;
        this.action_time = action_time;
        this.search_keyword = search_keyword;
        this.click_category_id = click_category_id;
        this.click_product_id = click_product_id;
        this.order_category_ids = order_category_ids;
        this.order_product_ids = order_product_ids;
        this.pay_category_ids = pay_category_ids;
        this.pay_product_ids = pay_product_ids;
        this.city_id = city_id;
    }

    /** @return the {@code date} property */
    public String getDate() {
        return this.date;
    }

    /** @param value new value of the {@code date} property */
    public void setDate(String value) {
        this.date = value;
    }

    /** @return the {@code user_id} property */
    public Long getUser_id() {
        return this.user_id;
    }

    /** @param value new value of the {@code user_id} property */
    public void setUser_id(Long value) {
        this.user_id = value;
    }

    /** @return the {@code session_id} property */
    public String getSession_id() {
        return this.session_id;
    }

    /** @param value new value of the {@code session_id} property */
    public void setSession_id(String value) {
        this.session_id = value;
    }

    /** @return the {@code page_id} property */
    public Long getPage_id() {
        return this.page_id;
    }

    /** @param value new value of the {@code page_id} property */
    public void setPage_id(Long value) {
        this.page_id = value;
    }

    /** @return the {@code action_time} property */
    public String getAction_time() {
        return this.action_time;
    }

    /** @param value new value of the {@code action_time} property */
    public void setAction_time(String value) {
        this.action_time = value;
    }

    /** @return the {@code search_keyword} property */
    public String getSearch_keyword() {
        return this.search_keyword;
    }

    /** @param value new value of the {@code search_keyword} property */
    public void setSearch_keyword(String value) {
        this.search_keyword = value;
    }

    /** @return the {@code click_category_id} property */
    public Long getClick_category_id() {
        return this.click_category_id;
    }

    /** @param value new value of the {@code click_category_id} property */
    public void setClick_category_id(Long value) {
        this.click_category_id = value;
    }

    /** @return the {@code click_product_id} property */
    public Long getClick_product_id() {
        return this.click_product_id;
    }

    /** @param value new value of the {@code click_product_id} property */
    public void setClick_product_id(Long value) {
        this.click_product_id = value;
    }

    /** @return the {@code order_category_ids} property */
    public String getOrder_category_ids() {
        return this.order_category_ids;
    }

    /** @param value new value of the {@code order_category_ids} property */
    public void setOrder_category_ids(String value) {
        this.order_category_ids = value;
    }

    /** @return the {@code order_product_ids} property */
    public String getOrder_product_ids() {
        return this.order_product_ids;
    }

    /** @param value new value of the {@code order_product_ids} property */
    public void setOrder_product_ids(String value) {
        this.order_product_ids = value;
    }

    /** @return the {@code pay_category_ids} property */
    public String getPay_category_ids() {
        return this.pay_category_ids;
    }

    /** @param value new value of the {@code pay_category_ids} property */
    public void setPay_category_ids(String value) {
        this.pay_category_ids = value;
    }

    /** @return the {@code pay_product_ids} property */
    public String getPay_product_ids() {
        return this.pay_product_ids;
    }

    /** @param value new value of the {@code pay_product_ids} property */
    public void setPay_product_ids(String value) {
        this.pay_product_ids = value;
    }

    /** @return the {@code city_id} property */
    public Long getCity_id() {
        return this.city_id;
    }

    /** @param value new value of the {@code city_id} property */
    public void setCity_id(Long value) {
        this.city_id = value;
    }
}
// 输出结果表
package com.atguigu.project01;
import java.io.Serializable;
/**
 * Serializable result bean holding the click, order and pay counts of one
 * category, with a natural ordering suitable for ranking categories.
 *
 * <p>The misspelled identifiers ({@code categroyId}, {@code orederCount}) are
 * kept on purpose: they are part of the public accessor names.
 */
public class CategoryCountInfo implements Serializable, Comparable<CategoryCountInfo> {

    private String categroyId;
    private Long clickCount;
    private Long orederCount;
    private Long payCount;

    /**
     * Creates a fully populated instance; arguments are stored unchanged.
     *
     * @param categroyId  category identifier
     * @param clickCount  click count
     * @param orederCount order count
     * @param payCount    pay count
     */
    public CategoryCountInfo(String categroyId, Long clickCount, Long orederCount, Long payCount) {
        this.categroyId = categroyId;
        this.clickCount = clickCount;
        this.orederCount = orederCount;
        this.payCount = payCount;
    }

    /** Creates an empty instance; every property starts out as {@code null}. */
    public CategoryCountInfo() {
    }

    /** @return the category identifier */
    public String getCategroyId() {
        return categroyId;
    }

    /** @param categroyId the category identifier to store */
    public void setCategroyId(String categroyId) {
        this.categroyId = categroyId;
    }

    /** @return the click count */
    public Long getClickCount() {
        return clickCount;
    }

    /** @param clickCount the click count to store */
    public void setClickCount(Long clickCount) {
        this.clickCount = clickCount;
    }

    /** @return the order count */
    public Long getOrederCount() {
        return orederCount;
    }

    /** @param orederCount the order count to store */
    public void setOrederCount(Long orederCount) {
        this.orederCount = orederCount;
    }

    /** @return the pay count */
    public Long getPayCount() {
        return payCount;
    }

    /** @param payCount the pay count to store */
    public void setPayCount(Long payCount) {
        this.payCount = payCount;
    }

    /**
     * Orders by click count, then order count, then pay count, each ascending.
     *
     * <p>The priority click &gt; order &gt; pay follows the stated ranking
     * rule (compare clicks first; on a tie compare orders; on a further tie
     * compare payments). Because the ordering is ascending, a "top N" ranking
     * has to sort in descending order.
     *
     * @param o the instance to compare against
     * @return a negative value, zero, or a positive value as this instance
     *         ranks below, equal to, or above {@code o}
     * @throws NullPointerException if {@code o} or any of the compared counts
     *         on either side is {@code null}
     */
    @Override
    public int compareTo(CategoryCountInfo o) {
        // Boxed Long values must not be compared with == (that compares
        // references); Long.compare unboxes and compares the numeric values.
        int byClick = Long.compare(this.clickCount, o.clickCount);
        if (byClick != 0) {
            return byClick;
        }
        int byOrder = Long.compare(this.orederCount, o.orederCount);
        if (byOrder != 0) {
            return byOrder;
        }
        return Long.compare(this.payCount, o.payCount);
    }

    @Override
    public String toString() {
        return "CategoryCountInfo{" +
                "categroyId='" + categroyId + '\'' +
                ", clickCount=" + clickCount +
                ", orederCount=" + orederCount +
                ", payCount=" + payCount +
                '}';
    }
}
2)核心业务代码实现
package com.atguigu.project01;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import java.util.*;
import java.util.stream.StreamSupport;
public class require01_top10Category_method3 {
public static void main(String[] args) {
SparkConf sparkConf = new SparkConf().setMaster("local[*]").setAppName("SparkCoreTest");
JavaSparkContext sparkContext = new JavaSparkContext(sparkConf);
//需求一 统计top10热门品类
//获取原始数据
JavaRDD<String> rdd = sparkContext.textFile("input/user_visit_action.txt");
//将原始数据进行转换
JavaRDD<UserVisitAction> actionRdd = rdd.map(item -> {
//获取一行数据,进行起个,封装到实体类中
String[] datas = item.split("_");
return new UserVisitAction(
datas[0],
Long.parseLong(datas[1]),
datas[2],
Long.parseLong(datas[3]),
datas[4],
datas[5],
Long.parseLong(datas[6]),
Long.parseLong(datas[7]),
datas[8],
datas[9],