Introduction
There are two main points in this job: first, when the three streams are unioned, every record carries only its own measure, so all of the metrics can be accumulated together in a single windowed aggregation; second, late-arriving data, which is handled by increasing the watermark delay and the window's allowed lateness.
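As a rough overview before the full listing below, the pipeline looks like this (the stream variable names here are placeholders for the three streams built in the application code; each input stream sets only the measure it is responsible for, so one windowed reduce can sum every metric at once):

// Overview sketch only; the complete application code appears later in this post.
pageViewStream                       // from dwd_traffic_page_log: pvCt = 1, svCt/durSum filled from the log
        .union(uniqueVisitorStream)  // from dwd_traffic_unique_visitor_detail: uvCt = 1, other measures 0
        .union(userJumpStream)       // from dwd_traffic_user_jump_detail: ujCt = 1, other measures 0
        .assignTimestampsAndWatermarks(
                WatermarkStrategy.<TrafficPageViewBean>forBoundedOutOfOrderness(Duration.ofSeconds(10))
                        .withTimestampAssigner((bean, recordTs) -> bean.getTs()))
        .keyBy(bean -> Tuple4.of(bean.getVc(), bean.getCh(), bean.getAr(), bean.getIsNew()),
                Types.TUPLE(Types.STRING, Types.STRING, Types.STRING, Types.STRING))
        .window(TumblingEventTimeWindows.of(Time.seconds(10)))
        .allowedLateness(Time.seconds(10))   // re-fire the window when late records arrive
        .reduce((value1, value2) -> {        // every measure is summed independently
            value1.setUvCt(value1.getUvCt() + value2.getUvCt());
            value1.setSvCt(value1.getSvCt() + value2.getSvCt());
            value1.setPvCt(value1.getPvCt() + value2.getPvCt());
            value1.setDurSum(value1.getDurSum() + value2.getDurSum());
            value1.setUjCt(value1.getUjCt() + value2.getUjCt());
            return value1;
        });
// The full code also fills in the window start/end (stt/edt) with a ProcessWindowFunction
// before writing each row to ClickHouse.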
Utility classes
Kafka utility class
public class MyKafkaUtil {
static String BOOTSTRAP_SERVERS = "master:9092, node1:9092, node2:9092";
static String DEFAULT_TOPIC = "default_topic";
/**
* Build a Kafka consumer for the given topic and consumer group.
*
* @param topic   source topic
* @param groupId consumer group id
* @return a FlinkKafkaConsumer that reads records as strings
*/
public static FlinkKafkaConsumer<String> getKafkaConsumer(String topic, String groupId) {
Properties prop = new Properties();
prop.setProperty("bootstrap.servers", BOOTSTRAP_SERVERS);
prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic,
//The default SimpleStringSchema throws an exception when the record value is null, so use a custom deserialization schema that returns an empty string instead
new KafkaDeserializationSchema<String>() {
@Override
public boolean isEndOfStream(String nextElement) {
return false;
}
@Override
public String deserialize(ConsumerRecord<byte[], byte[]> record) throws Exception {
if (record == null || record.value() == null) {
return "";
}
return new String(record.value());
}
@Override
public TypeInformation<String> getProducedType() {
return BasicTypeInfo.STRING_TYPE_INFO;
}
}, prop);
return consumer;
}
/**
* Build a Kafka producer for the given topic.
*
* @param topic target topic
* @return a FlinkKafkaProducer with exactly-once semantics
*/
public static FlinkKafkaProducer<String> getKafkaProducer(String topic) {
Properties prop = new Properties();
prop.setProperty("bootstrap.servers", BOOTSTRAP_SERVERS);
// Exactly-once delivery uses Kafka transactions, so keep the transaction timeout within the broker's transaction.max.timeout.ms (15 minutes by default)
prop.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 60 * 15 * 1000 + "");
// DEFAULT_TOPIC is only a fallback required by this constructor; the serialization schema routes every record to the actual target topic
FlinkKafkaProducer<String> producer = new FlinkKafkaProducer<String>(DEFAULT_TOPIC, new KafkaSerializationSchema<String>() {
@Override
public ProducerRecord<byte[], byte[]> serialize(String jsonStr, @Nullable Long timestamp) {
return new ProducerRecord<byte[], byte[]>(topic, jsonStr.getBytes());
}
}, prop,
FlinkKafkaProducer.Semantic.EXACTLY_ONCE);
return producer;
}
}
@Data
@AllArgsConstructor
public class TrafficPageViewBean {
    // Window start time
    String stt;
    // Window end time
    String edt;
    // App version
    String vc;
    // Channel
    String ch;
    // Region
    String ar;
    // New/returning visitor flag
    String isNew;
    // Unique visitor count
    Long uvCt;
    // Session count
    Long svCt;
    // Page view count
    Long pvCt;
    // Total page dwell time
    Long durSum;
    // Bounced (user-jump) session count
    Long ujCt;
    // Timestamp
    Long ts;
}
ClickHouse utility class
public class ClickHouseUtil {
// ClickHouse JDBC driver
public static final String CLICKHOUSE_DRIVER = "ru.yandex.clickhouse.ClickHouseDriver";
// ClickHouse connection URL; gmall_rebuild is the target database (create database gmall_rebuild;)
public static final String CLICKHOUSE_URL = "jdbc:clickhouse://master:8123/gmall_rebuild";
public static <T> SinkFunction<T> getJdbcSink(String sql) {
return JdbcSink.<T>sink(
sql,
new JdbcStatementBuilder<T>() {
@Override
public void accept(PreparedStatement preparedStatement, T obj) throws SQLException {
Field[] declaredFields = obj.getClass().getDeclaredFields();
int skipNum = 0;
for (int i = 0; i < declaredFields.length; i++) {
Field declaredField = declaredFields[i];
//The custom @TransientSink annotation marks fields that should not be written to the sink, so skip them here
TransientSink transientSink = declaredField.getAnnotation(TransientSink.class);
if (transientSink != null) {
skipNum++;
continue;
}
declaredField.setAccessible(true);
try {
Object value = declaredField.get(obj);
preparedStatement.setObject(i + 1 - skipNum, value);
} catch (IllegalAccessException e) {
System.out.println("Failed to bind a value to the ClickHouse insert SQL placeholder");
e.printStackTrace();
}
}
}
},
JdbcExecutionOptions.builder()
.withBatchIntervalMs(5000L)
.withBatchSize(5)
.build(),
new JdbcConnectionOptions.JdbcConnectionOptionsBuilder()
.withDriverName(CLICKHOUSE_DRIVER)
.withUrl(CLICKHOUSE_URL)
.build()
);
}
}
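The TransientSink annotation referenced above is a custom marker and is not shown in this post. A minimal sketch of what it could look like, assuming a plain runtime-retained, field-level marker annotation, is:

import java.lang.annotation.ElementType;
import java.lang.annotation.Retention;
import java.lang.annotation.RetentionPolicy;
import java.lang.annotation.Target;

// Marker annotation: fields tagged with @TransientSink are skipped by ClickHouseUtil
// when binding bean fields to the insert statement's placeholders.
@Target(ElementType.FIELD)
@Retention(RetentionPolicy.RUNTIME)
public @interface TransientSink {
}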
Implementation
Table creation statement
drop table if exists dws_traffic_vc_ch_ar_is_new_page_view_window;
create table if not exists dws_traffic_vc_ch_ar_is_new_page_view_window
(
stt DateTime,
edt DateTime,
vc String,
ch String,
ar String,
is_new String,
uv_ct UInt64,
sv_ct UInt64,
pv_ct UInt64,
dur_sum UInt64,
uj_ct UInt64,
ts UInt64
) engine = ReplacingMergeTree(ts)
partition by toYYYYMMDD(stt)
order by (stt, edt, vc, ch, ar, is_new);
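Because allowedLateness makes a window fire again when a late record arrives, the same (stt, edt, vc, ch, ar, is_new) combination can be written to ClickHouse more than once. ReplacingMergeTree(ts) eventually keeps only the row with the largest ts (the window function below stamps ts with the processing time), but deduplication only happens at merge time, so queries that need exact figures should read with FINAL. An illustrative query (not from the original post):

select vc, ch, ar, is_new,
       sum(uv_ct)   as uv_ct,
       sum(pv_ct)   as pv_ct,
       sum(dur_sum) as dur_sum
from dws_traffic_vc_ch_ar_is_new_page_view_window final
where toDate(stt) = today()
group by vc, ch, ar, is_new;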
Application code
public class DwsTrafficVcChArIsNewPageViewWindow {
public static void main(String[] args) throws Exception {
// TODO 1. Set up the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(4);
// TODO 2. Checkpointing and state backend configuration
env.enableCheckpointing(3000L, CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setCheckpointTimeout(30 * 1000L);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
env.getCheckpointConfig().enableExternalizedCheckpoints(
CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
);
env.setRestartStrategy(RestartStrategies.failureRateRestart(
3, Time.days(1), Time.minutes(1)
));
env.setStateBackend(new HashMapStateBackend());
env.getCheckpointConfig().setCheckpointStorage(
"hdfs://hadoop102:8020/ck"
);
System.setProperty("HADOOP_USER_NAME", "atguigu");
// TODO 3. Read page log data from the Kafka dwd_traffic_page_log topic as a stream
String topic = "dwd_traffic_page_log";
String groupId = "dws_traffic_channel_page_view_window";
FlinkKafkaConsumer<String> kafkaConsumer = MyKafkaUtil.getKafkaConsumer(topic, groupId);
DataStreamSource<String> pageLogSource = env.addSource(kafkaConsumer);
// TODO 4. Convert the page stream into JSON objects
SingleOutputStreamOperator<JSONObject> jsonObjStream = pageLogSource.map(JSON::parseObject);
// TODO 5. Compute session count, page view count and page dwell time, and wrap them in the bean
SingleOutputStreamOperator<TrafficPageViewBean> mainStream = jsonObjStream.map(
new MapFunction<JSONObject, TrafficPageViewBean>() {
@Override
public TrafficPageViewBean map(JSONObject jsonObj) throws Exception {
JSONObject common = jsonObj.getJSONObject("common");
JSONObject page = jsonObj.getJSONObject("page");
// Get the event timestamp
Long ts = jsonObj.getLong("ts");
// Get the dimension fields
String vc = common.getString("vc");
String ch = common.getString("ch");
String ar = common.getString("ar");
String isNew = common.getString("is_new");
// Get the page dwell time
Long duringTime = page.getLong("during_time");
// Define variables to hold the other measures
Long uvCt = 0L;
Long svCt = 0L;
Long pvCt = 1L;
Long ujCt = 0L;
// Determine whether this page opens a new session
String lastPageId = page.getString("last_page_id");
if (lastPageId == null) {
svCt = 1L;
}
// Wrap into the bean
TrafficPageViewBean trafficPageViewBean = new TrafficPageViewBean(
"",
"",
vc,
ch,
ar,
isNew,
uvCt,
svCt,
pvCt,
duringTime,
ujCt,
ts
);
return trafficPageViewBean;
}
}
);
// TODO 6. Read the user-jump detail and unique-visitor streams from Kafka, convert their structures, and union the three streams
// 6.1 Read user-jump detail data from the Kafka dwd_traffic_user_jump_detail topic
String ujdTopic = "dwd_traffic_user_jump_detail";
FlinkKafkaConsumer<String> ujdKafkaConsumer = MyKafkaUtil.getKafkaConsumer(ujdTopic, groupId);
DataStreamSource<String> ujdSource = env.addSource(ujdKafkaConsumer);
SingleOutputStreamOperator<TrafficPageViewBean> ujdMappedStream =
ujdSource.map(jsonStr -> {
JSONObject jsonObj = JSONObject.parseObject(jsonStr);
JSONObject common = jsonObj.getJSONObject("common");
// Jump-detail records are emitted after a CEP timeout, so shift ts forward by 10 s
// (presumably to keep these records from falling behind the watermark and being dropped as late)
Long ts = jsonObj.getLong("ts") + 10 * 1000L;
// Get the dimension fields
String vc = common.getString("vc");
String ch = common.getString("ch");
String ar = common.getString("ar");
String isNew = common.getString("is_new");
// Wrap into the bean
return new TrafficPageViewBean(
"",
"",
vc,
ch,
ar,
isNew,
0L,
0L,
0L,
0L,
1L,
ts
);
});
// 6.2 Read unique-visitor data from the Kafka dwd_traffic_unique_visitor_detail topic
String uvTopic = "dwd_traffic_unique_visitor_detail";
FlinkKafkaConsumer<String> uvKafkaConsumer = MyKafkaUtil.getKafkaConsumer(uvTopic, groupId);
DataStreamSource<String> uvSource = env.addSource(uvKafkaConsumer);
SingleOutputStreamOperator<TrafficPageViewBean> uvMappedStream =
uvSource.map(jsonStr -> {
JSONObject jsonObj = JSON.parseObject(jsonStr);
JSONObject common = jsonObj.getJSONObject("common");
Long ts = jsonObj.getLong("ts");
// Get the dimension fields
String vc = common.getString("vc");
String ch = common.getString("ch");
String ar = common.getString("ar");
String isNew = common.getString("is_new");
// Wrap into the bean
return new TrafficPageViewBean(
"",
"",
vc,
ch,
ar,
isNew,
1L,
0L,
0L,
0L,
0L,
ts
);
});
// 6.3 Union the three streams
DataStream<TrafficPageViewBean> pageViewBeanDS = mainStream
.union(ujdMappedStream)
.union(uvMappedStream);
// TODO 7. Assign watermarks
SingleOutputStreamOperator<TrafficPageViewBean> withWatermarkStream = pageViewBeanDS.assignTimestampsAndWatermarks(
WatermarkStrategy
//TODO If no out-of-orderness delay is set here, some records will be missed
.<TrafficPageViewBean>forBoundedOutOfOrderness(Duration.ofSeconds(10L))
.withTimestampAssigner(
new SerializableTimestampAssigner<TrafficPageViewBean>() {
@Override
public long extractTimestamp(TrafficPageViewBean trafficPageViewBean, long recordTimestamp) {
return trafficPageViewBean.getTs();
}
}
)
);
// TODO 8. Key by the dimension fields
KeyedStream<TrafficPageViewBean, Tuple4<String, String, String, String>> keyedBeanStream = withWatermarkStream.keyBy(trafficPageViewBean ->
Tuple4.of(
trafficPageViewBean.getVc(),
trafficPageViewBean.getCh(),
trafficPageViewBean.getAr(),
trafficPageViewBean.getIsNew()
)
, Types.TUPLE(Types.STRING, Types.STRING, Types.STRING, Types.STRING)
);
// TODO 9. Open windows
WindowedStream<TrafficPageViewBean, Tuple4<String, String, String, String>, TimeWindow> windowStream = keyedBeanStream.window(TumblingEventTimeWindows.of(
org.apache.flink.streaming.api.windowing.time.Time.seconds(10L)))
.allowedLateness(org.apache.flink.streaming.api.windowing.time.Time.seconds(10L));
// TODO 10. Aggregate within each window
SingleOutputStreamOperator<TrafficPageViewBean> reducedStream = windowStream.reduce(
new ReduceFunction<TrafficPageViewBean>() {
@Override
public TrafficPageViewBean reduce(TrafficPageViewBean value1, TrafficPageViewBean value2) throws Exception {
value1.setUvCt(value1.getUvCt() + value2.getUvCt());
value1.setSvCt(value1.getSvCt() + value2.getSvCt());
value1.setPvCt(value1.getPvCt() + value2.getPvCt());
value1.setDurSum(value1.getDurSum() + value2.getDurSum());
value1.setUjCt(value1.getUjCt() + value2.getUjCt());
return value1;
}
},
new ProcessWindowFunction<TrafficPageViewBean, TrafficPageViewBean, Tuple4<String, String, String, String>, TimeWindow>() {
@Override
public void process(Tuple4<String, String, String, String> key, Context context, Iterable<TrafficPageViewBean> elements, Collector<TrafficPageViewBean> out) throws Exception {
String stt = DateFormatUtil.toYmdHms(context.window().getStart());
String edt = DateFormatUtil.toYmdHms(context.window().getEnd());
for (TrafficPageViewBean element : elements) {
element.setStt(stt);
element.setEdt(edt);
element.setTs(System.currentTimeMillis());
out.collect(element);
}
}
}
);
// TODO 11. Write to the OLAP database (ClickHouse)
reducedStream.addSink(ClickHouseUtil.<TrafficPageViewBean>getJdbcSink(
"insert into dws_traffic_vc_ch_ar_is_new_page_view_window values(?,?,?,?,?,?,?,?,?,?,?,?)"
));
env.execute();
}
}
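The application code also calls DateFormatUtil.toYmdHms, which is not included in this post. A minimal sketch, assuming it simply formats an epoch-millisecond timestamp as yyyy-MM-dd HH:mm:ss in the local time zone, could be:

import java.time.Instant;
import java.time.LocalDateTime;
import java.time.ZoneId;
import java.time.format.DateTimeFormatter;

public class DateFormatUtil {

    private static final DateTimeFormatter DTF_FULL =
            DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss");

    // Format an epoch-millisecond timestamp as "yyyy-MM-dd HH:mm:ss"
    public static String toYmdHms(long ts) {
        LocalDateTime dateTime = LocalDateTime.ofInstant(Instant.ofEpochMilli(ts), ZoneId.systemDefault());
        return DTF_FULL.format(dateTime);
    }
}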