Flink项目之统计网站热门商品
项目介绍
每五分钟统计电商网站一小时内的热门商品
数据格式
543462,1715,1464116,pv,1511658000
543462,1715,1464116,pv,1511658000
662867,2244074,1575622,pv,1511658000
每行以逗号分隔,各字段依次对应 userId,itemId,categoryId,behavior,timestamp(timestamp 为秒级时间戳,代码中会乘以 1000 转换为毫秒)
涉及内容
窗口、状态
相关支持
Flink 1.12.5、Java 8
项目所需依赖
<?xml version="1.0" encoding="UTF-8"?>
<!-- Aggregator POM (packaging "pom") for the user-behavior analysis project; it lists the child modules built together. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>org.example</groupId>
<artifactId>com.louxun.UserBehaviorAnalysis</artifactId>
<packaging>pom</packaging>
<version>1.0-SNAPSHOT</version>
<!-- Child modules aggregated by this build. -->
<modules>
<module>HotItemsAnalysis</module>
<module>NetworkFlowAnalysis</module>
</modules>
<!-- Versions referenced by the dependency declarations below. -->
<properties>
<flink.version>1.12.5</flink.version>
<!-- Scala binary version used as the suffix of the Flink artifact ids below. -->
<scala.binary.version>2.12</scala.binary.version>
</properties>
<dependencies>
<!-- Flink Java API. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-java</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink DataStream (streaming) API. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Flink client classes. -->
<dependency>
<groupId>org.apache.flink</groupId>
<artifactId>flink-clients_${scala.binary.version}</artifactId>
<version>${flink.version}</version>
</dependency>
<!-- Lombok, used by the entity classes for @Data. -->
<!-- NOTE(review): Lombok is normally only needed at compile time; consider <scope>provided</scope>. -->
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<version>1.18.20</version>
</dependency>
</dependencies>
</project>
目录结构
编码
- UserBehaviorEntity
package com.louxun.entity;
import lombok.Data;
/**
 * One user-behavior record, built from a CSV line of the form
 * {@code userId,itemId,categoryId,behavior,timestamp}.
 *
 * <p>Lombok's {@code @Data} supplies the getters, setters, {@code equals}/{@code hashCode}
 * and {@code toString}.
 */
@Data
public class UserBehaviorEntity {
    /** Id of the user who performed the action. */
    private Long userId;
    /** Id of the item the action refers to. */
    private Long itemId;
    /** Id of the category the item belongs to. */
    private Long categoryId;
    /** Behavior type as it appears in the data, e.g. {@code "pv"}. */
    private String behavior;
    /** Event time of the record; the job multiplies it by 1000 to obtain milliseconds. */
    private Long timestamp;

    /**
     * No-argument constructor.
     *
     * <p>Flink only recognises a class as a POJO type when it has a public no-argument
     * constructor; without one the type is handled by the slower generic serializer.
     */
    public UserBehaviorEntity() {
    }

    /**
     * Creates a fully populated record.
     *
     * @param userId     id of the acting user
     * @param itemId     id of the item
     * @param categoryId id of the item's category
     * @param behavior   behavior type, e.g. {@code "pv"}
     * @param timestamp  event time of the record
     */
    public UserBehaviorEntity(Long userId, Long itemId, Long categoryId, String behavior, Long timestamp) {
        this.userId = userId;
        this.itemId = itemId;
        this.categoryId = categoryId;
        this.behavior = behavior;
        this.timestamp = timestamp;
    }
}
- ItemViewCountEntiy
package com.louxun.entity;
import lombok.Data;
import java.io.Serializable;
/**
 * View count of one item within one window.
 *
 * <p>Lombok's {@code @Data} supplies the getters, setters, {@code equals}/{@code hashCode}
 * and {@code toString}; the implicit no-argument constructor is the one callers use.
 */
@Data
public class ItemViewCountEntiy implements Serializable {
    /** Explicit version id so the serialized form does not depend on compiler-generated defaults. */
    private static final long serialVersionUID = 1L;

    /** Id of the item that was counted. */
    private Long itemId;
    /** End timestamp of the window the count belongs to. */
    private Long windowEnd;
    /** Number of events counted for the item in that window. */
    private Long count;
}
- HotItemsTask
package com.louxun.task;
import com.louxun.customFunction.CountAgg;
import com.louxun.customFunction.TopNHotItems;
import com.louxun.customFunction.WindowResultFunction;
import com.louxun.entity.UserBehaviorEntity;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import java.time.Duration;
/**
 * Hot-items job: counts "pv" events per item over a one-hour sliding event-time window
 * that advances every five minutes, then prints the top items of each window.
 */
public class HotItemsTask {

    /** Input file that is read when no path is given on the command line. */
    private static final String DEFAULT_INPUT_PATH =
            "D:\\flink_demo\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv";

    /**
     * Builds and runs the job.
     *
     * @param args optional; {@code args[0]} overrides the default input file path
     * @throws Exception if the job fails
     */
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);

        String path = (args != null && args.length > 0) ? args[0] : DEFAULT_INPUT_PATH;

        // Read the raw lines, convert them to entities and assign event timestamps.
        SingleOutputStreamOperator<UserBehaviorEntity> events = env.readTextFile(path)
                .map(line -> parseLine(line))
                .assignTimestampsAndWatermarks(
                        WatermarkStrategy
                                // Watermark lags the largest seen timestamp by 2 seconds.
                                .<UserBehaviorEntity>forBoundedOutOfOrderness(Duration.ofSeconds(2))
                                // Source timestamps are multiplied by 1000 to get milliseconds.
                                .withTimestampAssigner((event, timestamp) -> event.getTimestamp() * 1000L)
                );

        // Flink 1.10 equivalent of the timestamp assignment above:
        // .assignTimestampsAndWatermarks(
        //         new AscendingTimestampExtractor<UserBehaviorEntity>() {
        //             @Override
        //             public long extractAscendingTimestamp(UserBehaviorEntity element) {
        //                 return element.getTimestamp() * 1000L;
        //             }
        //         }
        // );

        // Keep only page views, count them per item and window, then rank per window end.
        events.filter(entity -> "pv".equals(entity.getBehavior()))
                .keyBy(entity -> entity.getItemId())
                // Flink 1.10 equivalent: .timeWindow(Time.hours(1), Time.minutes(5))
                .window(SlidingEventTimeWindows.of(Time.hours(1), Time.minutes(5)))
                .aggregate(new CountAgg(), new WindowResultFunction())
                .keyBy(item -> item.getWindowEnd())
                .process(new TopNHotItems())
                .print();

        env.execute("hot items list");
    }

    /**
     * Converts one comma-separated input line into an entity.
     *
     * @param line a line of the form {@code userId,itemId,categoryId,behavior,timestamp}
     * @return the parsed record
     * @throws NumberFormatException if a numeric column cannot be parsed
     */
    private static UserBehaviorEntity parseLine(String line) {
        String[] words = line.split(",");
        // Long.valueOf replaces the deprecated new Long(String) boxing constructor.
        return new UserBehaviorEntity(
                Long.valueOf(words[0]),
                Long.valueOf(words[1]),
                Long.valueOf(words[2]),
                words[3],
                Long.valueOf(words[4]));
    }
}
- CountAgg
package com.louxun.customFunction;
import com.louxun.entity.UserBehaviorEntity;
import org.apache.flink.api.common.functions.AggregateFunction;
/**
 * Aggregate function that counts the elements it receives: the accumulator is the running
 * count and is also the result.
 */
public class CountAgg implements AggregateFunction<UserBehaviorEntity, Long, Long> {

    /** Starts a new count at zero. */
    @Override
    public Long createAccumulator() {
        return 0L;
    }

    /** Increments the count by one; the element's content is not inspected. */
    @Override
    public Long add(UserBehaviorEntity value, Long accumulator) {
        final long incremented = accumulator + 1L;
        return incremented;
    }

    /** Returns the count unchanged. */
    @Override
    public Long getResult(Long accumulator) {
        return accumulator;
    }

    /** Combines two partial counts by adding them. */
    @Override
    public Long merge(Long a, Long b) {
        return Long.sum(a, b);
    }
}
- TopNHotItems
package com.louxun.customFunction;
import com.louxun.entity.ItemViewCountEntiy;
import org.apache.commons.compress.utils.Lists;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
/**
 * Collects the per-item view counts that share the current key and, once an event-time
 * timer fires, emits a formatted ranking of the items with the highest counts.
 */
public class TopNHotItems extends KeyedProcessFunction<Long, ItemViewCountEntiy, String> {

    /** Number of items reported when no size is given. */
    private static final int DEFAULT_TOP_SIZE = 5;

    /** Maximum number of items listed in each emitted ranking. */
    private final int topSize;

    /** Buffered counts for the current key; initialised in {@link #open(Configuration)}. */
    private transient ListState<ItemViewCountEntiy> itemState;

    /** Creates a function that reports the top 5 items. */
    public TopNHotItems() {
        this(DEFAULT_TOP_SIZE);
    }

    /**
     * Creates a function that reports the top {@code topSize} items.
     *
     * @param topSize maximum number of items per ranking; must be positive
     * @throws IllegalArgumentException if {@code topSize} is not positive
     */
    public TopNHotItems(int topSize) {
        if (topSize <= 0) {
            throw new IllegalArgumentException("topSize must be positive: " + topSize);
        }
        this.topSize = topSize;
    }

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        itemState = getRuntimeContext().getListState(
                new ListStateDescriptor<ItemViewCountEntiy>("itemState-state", ItemViewCountEntiy.class));
    }

    /**
     * Buffers the incoming count and registers an event-time timer one millisecond after
     * the element's window end.
     */
    @Override
    public void processElement(ItemViewCountEntiy value, Context ctx, Collector<String> out) throws Exception {
        itemState.add(value);
        ctx.timerService().registerEventTimeTimer(value.getWindowEnd() + 1);
    }

    /**
     * Sorts the buffered counts in descending order and emits the first {@code topSize}
     * of them (fewer if fewer were buffered) as one formatted string.
     */
    @Override
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
        super.onTimer(timestamp, ctx, out);

        // Copy the state into a local list so it can be sorted after the state is cleared.
        List<ItemViewCountEntiy> list = new ArrayList<>();
        Iterable<ItemViewCountEntiy> buffered = itemState.get();
        if (buffered != null) {
            for (ItemViewCountEntiy entiy : buffered) {
                list.add(entiy);
            }
        }
        itemState.clear();

        // Highest count first; Long.compare avoids the overflow risk of subtracting longs.
        list.sort((left, right) -> Long.compare(right.getCount(), left.getCount()));

        // Never index past the end when fewer than topSize items were buffered.
        int limit = Math.min(topSize, list.size());
        StringBuilder result = new StringBuilder();
        for (int i = 0; i < limit; i++) {
            ItemViewCountEntiy entiy = list.get(i);
            result.append("No").append(i + 1).append(":")
                    .append(" 商品ID=").append(entiy.getItemId())
                    .append(" 浏览量=").append(entiy.getCount()).append("\n");
        }
        result.append("====================================\n\n");

        // Deliberate throttle so the printed output scrolls slowly; it blocks the calling thread.
        Thread.sleep(1000);
        out.collect(result.toString());
    }
}
- WindowResultFunction
package com.louxun.customFunction;
import com.louxun.entity.ItemViewCountEntiy;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
 * Window function that wraps the single pre-aggregated count of a window into an
 * {@link ItemViewCountEntiy} together with the key and the window's end timestamp.
 */
public class WindowResultFunction implements WindowFunction<Long, ItemViewCountEntiy, Long, TimeWindow> {

    /**
     * Emits one result for the window.
     *
     * @param aLong  the key of the window, stored as the item id
     * @param window the window being evaluated; its end becomes the result's window end
     * @param input  the window's values; only the first element is read
     * @param out    collector receiving the result
     */
    @Override
    public void apply(Long aLong, TimeWindow window, Iterable<Long> input, Collector<ItemViewCountEntiy> out) throws Exception {
        final Long viewCount = input.iterator().next();
        final long windowEnd = window.getEnd();

        ItemViewCountEntiy summary = new ItemViewCountEntiy();
        summary.setWindowEnd(windowEnd);
        summary.setCount(viewCount);
        summary.setItemId(aLong);

        out.collect(summary);
    }
}