DWS层:商品主题宽表处理
与访客的dws层的宽表类似,也是把多个事实表的明细数据汇总起来组合成宽表
1. 需求分析与思路
- 从Kafka主题中获得数据流
- 把Json字符串数据流转换为统一数据对象的数据流
- 把统一的数据结构流合并为一个流
Flink 合并多条流使用 union 方法,前提是各条流的数据类型必须相同,所以需要先把每条流封装成主题宽表的 bean 对象(ProductStats),后续再进行分组聚合去重、补充维度信息。
- 设定事件时间与水位线
- 分组、开窗、聚合
- 写入ClickHouse
2. 功能实现
2.1 封装商品统计实体类ProductStats
import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;
import java.math.BigDecimal;
import java.util.HashSet;
import java.util.Set;
/**
 * One row of the product-topic wide table: the metrics of a single SKU.
 *
 * Every counter and amount has a zero default, so an instance created for one
 * kind of event only needs to set the metric that event contributes.
 * Accessors and constructors are generated by Lombok; the all-args constructor
 * takes its parameters in field declaration order, so do not reorder fields.
 * NOTE(review): a reflection-based sink presumably relies on this field order
 * as well -- confirm against MySinkUtil before changing it.
 */
@Data
@NoArgsConstructor
@AllArgsConstructor
public class ProductStats {
private String stt;// window start time
private String edt; // window end time
private Long sku_id; // SKU id
private String sku_name;// SKU name
private BigDecimal sku_price; // SKU unit price
private Long spu_id; // SPU id
private String spu_name;// SPU name
private Long tm_id; // trademark (brand) id
private String tm_name;// trademark (brand) name
private Long category3_id;// level-3 category id
private String category3_name;// level-3 category name
private Long display_ct = 0L; // number of displays (exposures)
private Long click_ct = 0L; // number of clicks
private Long favor_ct = 0L; // number of times added to favorites
private Long cart_ct = 0L; // number of times added to cart
private Long order_sku_num = 0L; // number of units of this SKU ordered
// amount ordered for this SKU, not the amount of the whole order
private BigDecimal order_amount = BigDecimal.ZERO;
private Long order_ct = 0L; // number of orders
// amount paid
private BigDecimal payment_amount = BigDecimal.ZERO;
private Long paid_order_ct = 0L; // number of paid orders
private Long refund_order_ct = 0L; // number of refunded orders
private BigDecimal refund_amount = BigDecimal.ZERO;
private Long comment_ct = 0L;// number of commented orders
private Long good_comment_ct = 0L; // number of orders with a good review
private Set<Long> orderIdSet = new HashSet<>(); // order ids collected to count orders
private Set<Long> paidOrderIdSet = new HashSet<>(); // order ids collected to count paid orders
private Set<Long> refundOrderIdSet = new HashSet<>();// order ids collected to count refunded orders
private Long ts; // statistics timestamp
}
2.2 消费Kafka数据, 合成一个流
/**
 * Wires the product-topic pipeline: parse and union the source streams,
 * aggregate per SKU and window, enrich with dimension data, then write the
 * result to ClickHouse (and print it for debugging).
 *
 * @param env           the stream execution environment (not used directly here)
 * @param sourceStreams raw JSON string streams keyed by topic name, e.g. "dwd_page_log"
 */
public void run(StreamExecutionEnvironment env,
                Map<String, DataStreamSource<String>> sourceStreams) {
    // 1. Parse the eight source streams and union them into a single stream.
    final DataStream<ProductStats> productStatsDataStream = parseStreamAndUnionOneStream(sourceStreams);
    // 2. Window and aggregate.
    final SingleOutputStreamOperator<ProductStats> productStatsAggStream = aggregateByDim(productStatsDataStream);
    // 3. Look up dimension data.
    final SingleOutputStreamOperator<ProductStats> productStatsWithDimStream = joinDims(productStatsAggStream);
    productStatsWithDimStream.print();
    // 4. Write to ClickHouse. The helper is declared as sink2Clickhouse (lower-case 'h');
    //    the previous spelling sink2ClickHouse did not match any method.
    sink2Clickhouse(productStatsWithDimStream);
}
/**
 * Step 1: turns each of the eight source streams into a stream of
 * {@link ProductStats} carrying only the metric that source contributes
 * (plus sku_id and ts), then unions the eight streams into one.
 *
 * @param sourceStreams raw JSON string streams keyed by topic name
 * @return the union of the eight parsed streams
 */
private DataStream<ProductStats> parseStreamAndUnionOneStream(Map<String, DataStreamSource<String>> sourceStreams) {
    // 1. Page log: a visit to the "good_detail" page counts as one click on the item.
    final SingleOutputStreamOperator<ProductStats> clicks = sourceStreams
            .get("dwd_page_log")
            .flatMap(new FlatMapFunction<String, ProductStats>() {
                @Override
                public void flatMap(String line,
                                    Collector<ProductStats> collector) throws Exception {
                    final JSONObject event = JSON.parseObject(line);
                    final JSONObject page = event.getJSONObject("page");
                    if (!"good_detail".equalsIgnoreCase(page.getString("page_id"))) {
                        return;
                    }
                    final ProductStats stats = new ProductStats();
                    stats.setSku_id(page.getLong("item"));
                    stats.setClick_ct(1L);
                    stats.setTs(event.getLong("ts"));
                    collector.collect(stats);
                }
            });

    // 2. Display log: only displays whose item_type is "sku_id" are counted.
    final SingleOutputStreamOperator<ProductStats> displays = sourceStreams
            .get("dwd_display_log")
            .process(new ProcessFunction<String, ProductStats>() {
                @Override
                public void processElement(String line,
                                           Context context,
                                           Collector<ProductStats> collector) throws Exception {
                    final JSONObject event = JSON.parseObject(line);
                    if (!"sku_id".equalsIgnoreCase(event.getString("item_type"))) {
                        return;
                    }
                    final ProductStats stats = new ProductStats();
                    stats.setSku_id(event.getLong("item"));
                    stats.setTs(event.getLong("ts"));
                    stats.setDisplay_ct(1L);
                    collector.collect(stats);
                }
            });

    // 3. Favorites: one record per favor event.
    final SingleOutputStreamOperator<ProductStats> favorites = sourceStreams
            .get("dwd_favor_info")
            .map(line -> {
                final JSONObject row = JSON.parseObject(line);
                final ProductStats stats = new ProductStats();
                stats.setSku_id(row.getLong("sku_id"));
                stats.setTs(MyTimeUtil.toTs(row.getString("create_time")));
                stats.setFavor_ct(1L);
                return stats;
            });

    // 4. Cart: one record per add-to-cart event.
    final SingleOutputStreamOperator<ProductStats> carts = sourceStreams
            .get("dwd_cart_info")
            .map(line -> {
                final JSONObject row = JSON.parseObject(line);
                final ProductStats stats = new ProductStats();
                stats.setSku_id(row.getLong("sku_id"));
                stats.setTs(MyTimeUtil.toTs(row.getString("create_time")));
                stats.setCart_ct(1L);
                return stats;
            });

    // 5. Orders: carry the order id in a set so orders can be counted distinctly later.
    final SingleOutputStreamOperator<ProductStats> orders = sourceStreams
            .get("dwm_order_wide")
            .map(line -> {
                final OrderWide wide = JSON.parseObject(line, OrderWide.class);
                final ProductStats stats = new ProductStats();
                stats.setSku_id(wide.getSku_id());
                stats.setTs(MyTimeUtil.toTs(wide.getCreate_time()));
                stats.getOrderIdSet().add(wide.getOrder_id());
                stats.setOrder_amount(wide.getSplit_total_amount());
                stats.setOrder_sku_num(wide.getSku_num());
                return stats;
            });

    // 6. Payments: same idea, using the paid-order id set.
    final SingleOutputStreamOperator<ProductStats> payments = sourceStreams
            .get("dwm_payment_wide")
            .map(line -> {
                final PaymentWide wide = JSON.parseObject(line, PaymentWide.class);
                final ProductStats stats = new ProductStats();
                stats.setSku_id(wide.getSku_id());
                stats.setTs(MyTimeUtil.toTs(wide.getPayment_create_time()));
                stats.getPaidOrderIdSet().add(wide.getOrder_id());
                stats.setPayment_amount(wide.getSplit_total_amount());
                return stats;
            });

    // 7. Refunds: refund amount plus the refunded order id.
    final SingleOutputStreamOperator<ProductStats> refunds = sourceStreams
            .get("dwd_order_refund_info")
            .map(line -> {
                final JSONObject row = JSON.parseObject(line);
                final Long sku = row.getLong("sku_id");
                final Long eventTs = MyTimeUtil.toTs(row.getString("create_time"));
                final BigDecimal amount = row.getBigDecimal("refund_amount");
                final ProductStats stats = new ProductStats();
                stats.setSku_id(sku);
                stats.setTs(eventTs);
                stats.getRefundOrderIdSet().add(row.getLong("order_id"));
                stats.setRefund_amount(amount);
                return stats;
            });

    // 8. Comments: every comment counts once; good reviews are counted additionally.
    final SingleOutputStreamOperator<ProductStats> comments = sourceStreams
            .get("dwd_comment_info")
            .map(line -> {
                final JSONObject row = JSON.parseObject(line);
                final ProductStats stats = new ProductStats();
                stats.setSku_id(row.getLong("sku_id"));
                stats.setTs(MyTimeUtil.toTs(row.getString("create_time")));
                stats.setComment_ct(1L);
                if (GmallConstant.APPRAISE_GOOD.equalsIgnoreCase(row.getString("appraise"))) {
                    stats.setGood_comment_ct(1L);
                }
                return stats;
            });

    return clicks.union(
            displays,
            favorites,
            carts,
            orders,
            payments,
            refunds,
            comments
    );
}
2.3 开窗, 聚合
/**
 * Step 2: assigns event time from {@code ts} (watermark tolerates 5 seconds of
 * out-of-orderness), keys the stream by sku_id, and aggregates every
 * 5-second tumbling event-time window into one record per SKU.
 *
 * @param productStatsDataStream the unioned per-event stream
 * @return one aggregated record per SKU and window, with window bounds and
 *         distinct order counts filled in
 */
private SingleOutputStreamOperator<ProductStats> aggregateByDim(DataStream<ProductStats> productStatsDataStream) {
    final WatermarkStrategy<ProductStats> watermarks = WatermarkStrategy
            .<ProductStats>forBoundedOutOfOrderness(Duration.ofSeconds(5))
            .withTimestampAssigner((stats, recordTs) -> stats.getTs());

    return productStatsDataStream
            .assignTimestampsAndWatermarks(watermarks)
            .keyBy(ProductStats::getSku_id)
            .window(TumblingEventTimeWindows.of(Time.seconds(5)))
            .reduce(new ReduceFunction<ProductStats>() {
                @Override
                public ProductStats reduce(ProductStats acc,
                                           ProductStats next) throws Exception {
                    // Plain counters.
                    acc.setDisplay_ct(acc.getDisplay_ct() + next.getDisplay_ct());
                    acc.setClick_ct(acc.getClick_ct() + next.getClick_ct());
                    acc.setFavor_ct(acc.getFavor_ct() + next.getFavor_ct());
                    acc.setCart_ct(acc.getCart_ct() + next.getCart_ct());
                    acc.setOrder_sku_num(acc.getOrder_sku_num() + next.getOrder_sku_num());
                    acc.setComment_ct(acc.getComment_ct() + next.getComment_ct());
                    acc.setGood_comment_ct(acc.getGood_comment_ct() + next.getGood_comment_ct());

                    // Monetary amounts.
                    acc.setOrder_amount(acc.getOrder_amount().add(next.getOrder_amount()));
                    acc.setPayment_amount(acc.getPayment_amount().add(next.getPayment_amount()));
                    acc.setRefund_amount(acc.getRefund_amount().add(next.getRefund_amount()));

                    // Order ids are merged as sets so each order is counted once per window.
                    acc.getOrderIdSet().addAll(next.getOrderIdSet());
                    acc.getPaidOrderIdSet().addAll(next.getPaidOrderIdSet());
                    acc.getRefundOrderIdSet().addAll(next.getRefundOrderIdSet());
                    return acc;
                }
            }, new ProcessWindowFunction<ProductStats, ProductStats, Long, TimeWindow>() {
                @Override
                public void process(Long skuId,
                                    Context context,
                                    Iterable<ProductStats> reduced,
                                    Collector<ProductStats> collector) throws Exception {
                    // The reduce step leaves exactly one pre-aggregated element per window.
                    final ProductStats result = reduced.iterator().next();
                    final TimeWindow window = context.window();
                    result.setStt(MyTimeUtil.toDateTimeString(window.getStart()));
                    result.setEdt(MyTimeUtil.toDateTimeString(window.getEnd()));

                    // Distinct order counts come from the sizes of the merged id sets.
                    result.setOrder_ct((long) result.getOrderIdSet().size());
                    result.setPaid_order_ct((long) result.getPaidOrderIdSet().size());
                    result.setRefund_order_ct((long) result.getRefundOrderIdSet().size());
                    collector.collect(result);
                }
            });
}
2.4 补充维度信息
封装异步读取维度函数
为了方便每次异步读取维度数据, 对异步函数做封装.
package com.gmall.realtime.util;
import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.async.ResultFuture;
import org.apache.flink.streaming.api.functions.async.RichAsyncFunction;
import redis.clients.jedis.Jedis;
import java.sql.Connection;
import java.sql.DriverManager;
import java.util.Collections;
import java.util.concurrent.ThreadPoolExecutor;
/**
 * Base class for asynchronous dimension enrichment.
 *
 * Each input element is handed to a worker thread, which obtains a Redis
 * client, lets the subclass fill in dimension fields via
 * {@link #addDim(Object, Jedis)}, and then completes the result future with
 * the same (mutated) element.
 *
 * @param <T> type of the element being enriched; input and output are the same object
 */
public abstract class DimAsyncFunction<T> extends RichAsyncFunction<T, T> {
    private String phoenixUrl;
    private Connection conn;
    private ThreadPoolExecutor pool;

    /** Opens the Phoenix JDBC connection and obtains the worker thread pool. */
    @Override
    public void open(Configuration parameters) throws Exception {
        phoenixUrl = "jdbc:phoenix:hadoop162,hadoop163,hadoop164:2181";
        conn = DriverManager.getConnection(phoenixUrl);
        pool = MyThreadPoolUtil.getThreadPool();
    }

    /**
     * Enriches {@code input} on a pool thread and completes {@code resultFuture}.
     *
     * The future is always completed: normally with the enriched element, or
     * exceptionally if the lookup fails. The Redis client is always closed.
     *
     * @param input        element to enrich in place
     * @param resultFuture future to complete with the enriched element
     */
    @Override
    public void asyncInvoke(T input,
                            ResultFuture<T> resultFuture) throws Exception {
        pool.execute(new Runnable() {
            @Override
            public void run() {
                Jedis redisClient = null;
                try {
                    redisClient = MyRedisUtil.getRedisClient();
                    // Business-specific dimension lookup, supplied by the subclass.
                    addDim(input, redisClient);
                    resultFuture.complete(Collections.singleton(input));
                } catch (Throwable t) {
                    // Catch everything at this thread boundary: without this the future
                    // would never complete and the failure would only show up as a timeout.
                    resultFuture.completeExceptionally(t);
                } finally {
                    if (redisClient != null) {
                        redisClient.close();
                    }
                }
            }
        });
    }

    /**
     * Fills dimension fields of {@code input}; called on a worker thread.
     *
     * @param input       element to enrich in place
     * @param redisClient Redis client valid for the duration of this call only
     */
    public abstract void addDim(T input, Jedis redisClient);

    /**
     * Reads one dimension row, trying the Redis cache first and falling back
     * to the database. A row read from the database is cached for one week.
     *
     * @param redisClient Redis client used for the cache lookup and write-back
     * @param tableName   dimension table name; also the prefix of the cache key.
     *                    It is concatenated into the SQL text, so it must come
     *                    from trusted code, never from user input.
     * @param id          value matched against the table's ID column
     * @return the dimension row as a JSON object
     * @throws IllegalStateException if the database has no row for {@code id}
     */
    public JSONObject readDim(Jedis redisClient, String tableName, Object id) {
        String key = tableName + ":" + id;
        // A single GET replaces EXISTS + GET: one round trip, and no window in which
        // the key can expire between the check and the read.
        final String cachedJson = redisClient.get(key);
        if (cachedJson != null) {
            System.out.println(tableName + "的 id " + id + "走缓存");
            return JSON.parseObject(cachedJson);
        }

        System.out.println(tableName + "的 id " + id + "走数据库");
        String sql = "select * from " + tableName + " where ID = ?";
        final java.util.List<JSONObject> rows = MyJDBCUtil
                .queryList(conn, sql, new Object[]{id}, JSONObject.class, false);
        if (rows == null || rows.isEmpty()) {
            throw new IllegalStateException(
                    "No dimension row found in " + tableName + " for ID " + id);
        }
        final JSONObject jsonObject = rows.get(0);
        // Cache the row in Redis with a one-week expiry.
        redisClient.setex(key, 60 * 60 * 24 * 7, jsonObject.toJSONString());
        return jsonObject;
    }

    /** Closes the JDBC connection and shuts down the thread pool if still open. */
    @Override
    public void close() throws Exception {
        if (conn != null && !conn.isClosed()) {
            conn.close();
        }
        if (pool != null && !pool.isShutdown()) {
            pool.shutdown();
        }
    }
}
补充维度信息
/**
 * Step 3: asynchronously enriches each aggregated record with SKU, SPU,
 * trademark and level-3 category attributes. Results are emitted in
 * completion order (unordered wait) with a 30-second timeout per element.
 *
 * @param productStatsAggStream aggregated records carrying only sku_id and metrics
 * @return the same records with dimension fields filled in
 */
private SingleOutputStreamOperator<ProductStats> joinDims(SingleOutputStreamOperator<ProductStats> productStatsAggStream) {
    final DimAsyncFunction<ProductStats> dimLookup = new DimAsyncFunction<ProductStats>() {
        @Override
        public void addDim(ProductStats input, Jedis redisClient) {
            // 1. SKU row first: it supplies the ids needed for the other lookups.
            final JSONObject sku = readDim(redisClient, "DIM_SKU_INFO", input.getSku_id());
            input.setSku_name(sku.getString("SKU_NAME"));
            input.setSku_price(sku.getBigDecimal("PRICE"));
            final Long category3Id = sku.getLong("CATEGORY3_ID");
            input.setCategory3_id(category3Id);
            final Long spuId = sku.getLong("SPU_ID");
            input.setSpu_id(spuId);
            final Long trademarkId = sku.getLong("TM_ID");
            input.setTm_id(trademarkId);

            // 2. SPU name.
            final JSONObject spu = readDim(redisClient, "DIM_SPU_INFO", spuId);
            input.setSpu_name(spu.getString("SPU_NAME"));

            // 3. Trademark name.
            final JSONObject trademark = readDim(redisClient, "DIM_BASE_TRADEMARK", trademarkId);
            input.setTm_name(trademark.getString("TM_NAME"));

            // 4. Level-3 category name.
            final JSONObject category3 = readDim(redisClient, "DIM_BASE_CATEGORY3", category3Id);
            input.setCategory3_name(category3.getString("NAME"));
        }
    };

    return AsyncDataStream.unorderedWait(
            productStatsAggStream,
            dimLookup,
            30,
            TimeUnit.SECONDS);
}
2.5 写入到ClickHouse
在ClickHouse中创建主题宽表
-- Switch to the target database.
USE gmall2021;

-- Product-topic wide table: one row per SKU and time window.
CREATE TABLE product_stats_2021
(
    -- window bounds
    stt             DateTime,
    edt             DateTime,
    -- SKU and its dimension attributes
    sku_id          UInt64,
    sku_name        String,
    sku_price       Decimal64(2),
    spu_id          UInt64,
    spu_name        String,
    tm_id           UInt64,
    tm_name         String,
    category3_id    UInt64,
    category3_name  String,
    -- metrics
    display_ct      UInt64,
    click_ct        UInt64,
    favor_ct        UInt64,
    cart_ct         UInt64,
    order_sku_num   UInt64,
    order_amount    Decimal64(2),
    order_ct        UInt64,
    payment_amount  Decimal64(2),
    paid_order_ct   UInt64,
    refund_order_ct UInt64,
    refund_amount   Decimal64(2),
    comment_ct      UInt64,
    good_comment_ct UInt64,
    -- version column for ReplacingMergeTree
    ts              UInt64
)
-- Rows sharing the same sorting key are deduplicated on merge, using ts as the version.
ENGINE = ReplacingMergeTree(ts)
PARTITION BY toYYYYMMDD(stt)
ORDER BY (stt, edt, sku_id);
写数据到ClickHouse 中
/**
 * Step 4: attaches a ClickHouse sink that writes the stream to table
 * product_stats_2021 in database gmall2021.
 *
 * @param resultStream the enriched, aggregated records to persist
 */
private void sink2Clickhouse(SingleOutputStreamOperator<ProductStats> resultStream) {
    final String database = "gmall2021";
    final String table = "product_stats_2021";
    resultStream.addSink(MySinkUtil.getClickHouseSink(database, table, ProductStats.class));
}
MySinkUtil详见:https://blog.csdn.net/weixin_42796403/article/details/115144073