(1) Positioning of the DWM layer
The DWM layer mainly serves the DWS layer. For some requirements, going directly from the DWD layer to the DWS layer involves a fair amount of computation, and the results of that computation are likely to be reused by multiple DWS-layer subjects, so part of the DWD layer is built out into a DWM layer.
(2) Visitor UV
UV stands for Unique Visitor. In real-time computation it can also be called DAU (Daily Active User), because the UV in real-time computation usually refers to the number of distinct visitors for the current day.
Code implementation:
package com.yyds.app.dwm;
import com.alibaba.fastjson.JSONAware;
import com.alibaba.fastjson.JSONObject;
import com.yyds.utils.MyKafkaUtils;
import org.apache.flink.api.common.functions.RichFilterFunction;
import org.apache.flink.api.common.state.StateTtlConfig;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import java.text.SimpleDateFormat;
/**
* (1) First, identify the first page the visitor opens, which marks the visitor entering our application
* (2) Second, since a visitor can enter the application multiple times in one day, deduplicate within the scope of a single day
*/
public class UniqueVisitApp {
public static void main(String[] args) throws Exception {
// TODO 1. Get the execution environment
System.setProperty("HADOOP_USER_NAME","root");
// Get the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
// Enable checkpointing, one checkpoint every 5 seconds
env.enableCheckpointing(5000L);
// Set the checkpoint consistency semantics
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// Set the timeout
//env.getCheckpointConfig().setAlignmentTimeout(10000L);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
// Restart strategy
// env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3,5000L));
// Retain the last checkpoint when the job is cancelled
env.getCheckpointConfig().enableExternalizedCheckpoints(
CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
);
// Set the state backend
env.setStateBackend(new FsStateBackend("hdfs://centos01:8020/flinkCDC/ck"));
// TODO 2. Read the dwd_page_log data
String sourceTopic = "dwd_page_log";
String groupId = "unique_visit_app_2022";
String sinkTopic = "dwm_unique_visit";
FlinkKafkaConsumer<String> kafkaConsumer = MyKafkaUtils.getKafkaConsumer(sourceTopic, groupId);
DataStreamSource<String> kafkaDS = env.addSource(kafkaConsumer);
// TODO 3. Convert each line into a JSON object
SingleOutputStreamOperator<JSONObject> jsonObj = kafkaDS.map(JSONObject::parseObject);
// TODO 4. Filter the data (stateful processing): keep only each mid's first visit record of the day
KeyedStream<JSONObject, String> keyedStream = jsonObj.keyBy(obj -> obj.getJSONObject("common").getString("mid"));
SingleOutputStreamOperator<JSONObject> filterStream = keyedStream.filter(new RichFilterFunction<JSONObject>() {
private ValueState<String> dateState;
private SimpleDateFormat sdf;
@Override
public void open(Configuration parameters) throws Exception {
ValueStateDescriptor<String> valueStateDescriptor = new ValueStateDescriptor<String>(
"valueStateDescriptor",
String.class
);
// Set a TTL on the state
StateTtlConfig stateTtlConfig = new StateTtlConfig
.Builder(Time.hours(24))
.setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
.build();
valueStateDescriptor.enableTimeToLive(stateTtlConfig);
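// Note: the 24-hour TTL only bounds state size for mids that stop appearing; the actual
// day-over-day reset comes from comparing the stored date string with curDate in filter()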
dateState = getRuntimeContext().getState(valueStateDescriptor);
sdf = new SimpleDateFormat("yyyy-MM-dd");
}
@Override
public boolean filter(JSONObject value) throws Exception {
// Get the previous page id
String lastPageId = value.getJSONObject("page").getString("last_page_id");
Long ts = value.getLong("ts");
String curDate = sdf.format(ts);
if (lastPageId == null || lastPageId.length() <= 0) {
// If it is empty, check the date stored in the state
String dateStateValue = dateState.value();
if(!curDate.equals(dateStateValue)){
// Update the state
dateState.update(curDate);
return true;
}else {
return false;
}
} else {
// If it is not empty, filter the record out
return false;
}
}
});
// TODO 5. Write the data to Kafka
SingleOutputStreamOperator<String> resStream = filterStream.map(JSONAware::toJSONString);
resStream.print("resStream----------------");
resStream.addSink(MyKafkaUtils.getKafkaProducer(sinkTopic));
// TODO 6. Start the job
env.execute("UniqueVisitApp");
}
}
(3) Bounce
A bounce is when a user visits a single page of the site and then leaves without going on to any other page.
➢ The page is the first page of the visitor's recent visit
This can be judged from whether the page has a previous page (last_page_id): if that field is empty, the page is the first page of this visit.
➢ After the first page, for a fairly long period (configurable), the user does not visit any other page.
Code implementation
package com.yyds.app.dwm;
import com.alibaba.fastjson.JSONAware;
import com.alibaba.fastjson.JSONObject;
import com.yyds.utils.MyKafkaUtils;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.cep.CEP;
import org.apache.flink.cep.PatternSelectFunction;
import org.apache.flink.cep.PatternStream;
import org.apache.flink.cep.PatternTimeoutFunction;
import org.apache.flink.cep.pattern.Pattern;
import org.apache.flink.cep.pattern.conditions.SimpleCondition;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.util.OutputTag;
import java.time.Duration;
import java.util.List;
import java.util.Map;
/**
* A user bounce event is essentially the combination of a condition event and a timeout event
*
* (1) The page is the first page of the visitor's recent visit
*
* (2) After the first page, for a fairly long period (configurable), the user does not visit any other page.
*/
public class UserJumpDetailApp {
public static void main(String[] args) throws Exception {
// TODO 1. Get the execution environment
System.setProperty("HADOOP_USER_NAME","root");
// Get the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1); // keep consistent with the number of Kafka partitions
// Enable checkpointing, one checkpoint every 5 seconds
env.enableCheckpointing(5000L);
// Set the checkpoint consistency semantics
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
// Set the timeout
//env.getCheckpointConfig().setAlignmentTimeout(10000L);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(2);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
// Restart strategy
// env.setRestartStrategy(RestartStrategies.fixedDelayRestart(3,5000L));
// Retain the last checkpoint when the job is cancelled
env.getCheckpointConfig().enableExternalizedCheckpoints(
CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
);
// Set the state backend
env.setStateBackend(new FsStateBackend("hdfs://centos01:8020/flinkCDC/ck"));
// TODO 2. Read the dwd_page_log data
String sourceTopic = "dwd_page_log";
String groupId = "user_jump_detail_app_2022";
String sinkTopic = "dwm_user_jump_detail";
FlinkKafkaConsumer<String> kafkaConsumer = MyKafkaUtils.getKafkaConsumer(sourceTopic, groupId);
DataStreamSource<String> kafkaDS = env.addSource(kafkaConsumer);
// TODO 3. Convert each line into a JSON object, extract the event timestamp and generate watermarks
SingleOutputStreamOperator<JSONObject> jsonObj = kafkaDS.map(JSONObject::parseObject)
.assignTimestampsAndWatermarks(
WatermarkStrategy.<JSONObject>forBoundedOutOfOrderness(Duration.ofSeconds(2))
.withTimestampAssigner(new SerializableTimestampAssigner<JSONObject>() {
@Override
public long extractTimestamp(JSONObject element, long recordTimestamp) {
return element.getLong("ts");
}
})
);
// TODO 4. Define the pattern sequence
Pattern<JSONObject, JSONObject> pattern = Pattern.<JSONObject>begin("start")
.where(new SimpleCondition<JSONObject>() {
@Override
public boolean filter(JSONObject value) throws Exception {
String lastPageId = value.getJSONObject("page").getString("last_page_id");
return lastPageId == null || lastPageId.length() <= 0;
}
}).next("next")
.where(new SimpleCondition<JSONObject>() {
@Override
public boolean filter(JSONObject value) throws Exception {
String lastPageId = value.getJSONObject("page").getString("last_page_id");
return lastPageId == null || lastPageId.length() <= 0;
}
}).within(Time.seconds(10));
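// "next" enforces strict contiguity: if a normal in-session page (non-empty last_page_id)
// arrives between the two session-start pages, the partial match is simply discarded,
// so ordinary navigation is never reported as a bounce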
// Pattern.<JSONObject>begin("start")
// .where(new SimpleCondition<JSONObject>() {
// @Override
// public boolean filter(JSONObject value) throws Exception {
// String lastPageId = value.getJSONObject("page").getString("last_page_id");
// return lastPageId == null || lastPageId.length() <= 0;
// }
// })
// .times(2)
// .consecutive() // strict contiguity, equivalent to next()
// .within(Time.seconds(10));
// TODO 5. Apply the pattern sequence to the stream
PatternStream<JSONObject> patternStream = CEP.pattern(
jsonObj.keyBy(json -> json.getJSONObject("common").getString("mid")),
pattern
);
// TODO 6. Extract the events (matched events as well as timed-out events)
OutputTag<JSONObject> timeOutTag = new OutputTag<JSONObject>("timeOut"){};
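// Both branches below return the "start" event: a timeout means no further page log arrived
// within 10 seconds, and a full match means the next page within 10 seconds was itself a new
// session start (empty last_page_id) -- either way the first page is a bounce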
SingleOutputStreamOperator<JSONObject> selectDS = patternStream.select(
timeOutTag,
new PatternTimeoutFunction<JSONObject, JSONObject>() {
@Override
public JSONObject timeout(Map<String, List<JSONObject>> map, long l) throws Exception {
return map.get("start").get(0);
}
},
new PatternSelectFunction<JSONObject, JSONObject>() {
@Override
public JSONObject select(Map<String, List<JSONObject>> map) throws Exception {
return map.get("start").get(0);
}
}
);
DataStream<JSONObject> timeOutDS = selectDS.getSideOutput(timeOutTag);
// TODO 7. Union the two event streams
DataStream<JSONObject> unionDS = selectDS.union(timeOutDS);
// TODO 8. Write to Kafka
unionDS.print("unionDS-------------");
unionDS.map(JSONAware::toJSONString).addSink(MyKafkaUtils.getKafkaProducer(sinkTopic));
// TODO 9. Start the job
env.execute("UserJumpDetailApp");
}
}
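Both jobs depend on a MyKafkaUtils helper class that is not shown in this section. The sketch below is only an assumption of what it might look like, inferred from the method signatures used above; the broker address (centos01:9092) and the plain string (de)serialization are guesses and should be adapted to the actual project.
package com.yyds.utils;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import java.util.Properties;
public class MyKafkaUtils {
    // Assumed broker address; adjust to the actual cluster
    private static final String KAFKA_SERVER = "centos01:9092";
    // Build a string consumer for the given topic and consumer group
    public static FlinkKafkaConsumer<String> getKafkaConsumer(String topic, String groupId) {
        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", KAFKA_SERVER);
        properties.setProperty("group.id", groupId);
        return new FlinkKafkaConsumer<>(topic, new SimpleStringSchema(), properties);
    }
    // Build a string producer that writes to the given topic
    public static FlinkKafkaProducer<String> getKafkaProducer(String topic) {
        return new FlinkKafkaProducer<>(KAFKA_SERVER, topic, new SimpleStringSchema());
    }
}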