关键词: process算子、TopN、TreeMap、valueState、mapState、listState
/**
 * Builds a keyed process function that maintains a running top-N per key, ranked by
 * {@code FlowTuple11.getBps()} in descending order, inside a single {@code ValueState}
 * holding a {@code TreeMap}. The current top-N is emitted and the state cleared when an
 * event-time timer fires.
 *
 * <p>Every processed element reads the whole map from state and writes it back.
 *
 * @param threshold the N of top-N; the map is trimmed whenever it holds more than this many entries
 * @return the process function (raw type kept for compatibility with existing callers)
 */
public KeyedProcessFunction createKeyedProcessFunction(long threshold) {
    // `threshold` is effectively final and captured directly by the anonymous class, so no
    // separate field/initializer method is needed.
    return new KeyedProcessFunction<Tuple, FlowTuple11, FlowTuple11>() {
        // State handles are obtained in open(); they are not part of the serialized function.
        private transient ValueState<TreeMap<Long, FlowTuple11>> valueState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            // Class literals cannot carry type arguments; the double cast is the usual workaround.
            @SuppressWarnings("unchecked")
            Class<TreeMap<Long, FlowTuple11>> treeMapClass =
                    (Class<TreeMap<Long, FlowTuple11>>) (Class<?>) TreeMap.class;
            ValueStateDescriptor<TreeMap<Long, FlowTuple11>> valueStateDesc =
                    new ValueStateDescriptor<>("topn-state", treeMapClass);
            valueState = getRuntimeContext().getState(valueStateDesc);
        }

        @Override
        public void processElement(FlowTuple11 value, Context ctx, Collector<FlowTuple11> out)
                throws Exception {
            TreeMap<Long, FlowTuple11> top = valueState.value();
            if (top == null) {
                top = newDescendingMap();
            }
            top.put(value.getBps(), value);
            // Keep only the top N: the last entry is the smallest bps.
            if (top.size() > threshold) {
                top.pollLastEntry();
            }
            valueState.update(top);
            // NOTE(review): one timer per distinct biz time; the first one to fire emits
            // everything accumulated so far for this key — confirm this is the intended boundary.
            ctx.timerService().registerEventTimeTimer(value.getBizTime());
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<FlowTuple11> out)
                throws Exception {
            TreeMap<Long, FlowTuple11> top = valueState.value();
            if (top == null) {
                // An earlier timer for this key already emitted and cleared the state.
                return;
            }
            for (FlowTuple11 item : top.values()) {
                out.collect(item);
            }
            valueState.clear();
        }

        /**
         * Creates an empty map ordered by key descending in which equal keys do not overwrite
         * each other.
         *
         * <p>The comparator deliberately never returns 0, which violates the {@code Comparator}
         * contract: only {@code put}, iteration and {@code pollLastEntry} behave as expected;
         * {@code get}, {@code containsKey} and {@code remove} will never find a key.
         */
        private TreeMap<Long, FlowTuple11> newDescendingMap() {
            return new TreeMap<Long, FlowTuple11>(
                    new Comparator<Long>() {
                        @Override
                        public int compare(Long y, Long x) {
                            return (x < y) ? -1 : 1;
                        }
                    });
        }
    };
}
在 RocksDB 状态后端下,MapState 的性能比 ValueState 高很多;在 heap 状态后端下,二者性能相近(实测 MapState 仍略高于 ValueState)。
/**
 * Builds a keyed process function that maintains a running top-N per key, ranked by
 * {@code FlowTuple11.getBps()} in descending order, inside a {@code MapState} that stores one
 * {@code TreeMap} under a single fixed map key. The current top-N is emitted and the state
 * cleared when an event-time timer fires.
 *
 * <p>Every processed element reads the whole map from state and writes it back.
 *
 * @param threshold the N of top-N; the map is trimmed whenever it holds more than this many entries
 * @return the process function (raw type kept for compatibility with existing callers)
 */
public KeyedProcessFunction createKeyedProcessFunction(long threshold) {
    // `threshold` is effectively final and captured directly by the anonymous class, so no
    // separate field/initializer method is needed.
    return new KeyedProcessFunction<Tuple, FlowTuple11, FlowTuple11>() {
        /** The only map key used inside the MapState; same text as the state name. */
        private static final String TOP_KEY = "topn-state";

        // State handles are obtained in open(); they are not part of the serialized function.
        private transient MapState<String, TreeMap<Long, FlowTuple11>> mapState;

        @Override
        public void open(Configuration parameters) throws Exception {
            super.open(parameters);
            // Class literals cannot carry type arguments; the double cast is the usual workaround.
            @SuppressWarnings("unchecked")
            Class<TreeMap<Long, FlowTuple11>> treeMapClass =
                    (Class<TreeMap<Long, FlowTuple11>>) (Class<?>) TreeMap.class;
            MapStateDescriptor<String, TreeMap<Long, FlowTuple11>> mapStateDesc =
                    new MapStateDescriptor<>("topn-state", String.class, treeMapClass);
            mapState = getRuntimeContext().getMapState(mapStateDesc);
        }

        @Override
        public void processElement(FlowTuple11 value, Context ctx, Collector<FlowTuple11> out)
                throws Exception {
            TreeMap<Long, FlowTuple11> top = mapState.get(TOP_KEY);
            if (top == null) {
                top = newDescendingMap();
            }
            top.put(value.getBps(), value);
            // Keep only the top N: the last entry is the smallest bps.
            if (top.size() > threshold) {
                top.pollLastEntry();
            }
            mapState.put(TOP_KEY, top);
            // NOTE(review): one timer per distinct biz time; the first one to fire emits
            // everything accumulated so far for this key — confirm this is the intended boundary.
            ctx.timerService().registerEventTimeTimer(value.getBizTime());
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<FlowTuple11> out)
                throws Exception {
            TreeMap<Long, FlowTuple11> top = mapState.get(TOP_KEY);
            if (top == null) {
                // An earlier timer for this key already emitted and cleared the state.
                return;
            }
            for (FlowTuple11 item : top.values()) {
                out.collect(item);
            }
            mapState.clear();
        }

        /**
         * Creates an empty map ordered by key descending in which equal keys do not overwrite
         * each other.
         *
         * <p>The comparator deliberately never returns 0, which violates the {@code Comparator}
         * contract: only {@code put}, iteration and {@code pollLastEntry} behave as expected;
         * {@code get}, {@code containsKey} and {@code remove} will never find a key.
         */
        private TreeMap<Long, FlowTuple11> newDescendingMap() {
            return new TreeMap<Long, FlowTuple11>(
                    new Comparator<Long>() {
                        @Override
                        public int compare(Long y, Long x) {
                            return (x < y) ? -1 : 1;
                        }
                    });
        }
    };
}
上面两种实现依然存在性能问题:当数据量很大时,每处理一条数据都要先从 StateBackend 中读出整个 TreeMap 对象,更新后再写回去。这两种设计的初衷是避免缓存完所有数据再进行 TopN 排序,从而减少内存消耗(只需要缓存 TopN 的数据);但是这种大对象的频繁存取反而增加了性能开销,最终造成数据积压,进而导致进程崩溃。
最后再尝试使用 ListState 实现:先缓存数据,再进行 TopN 排序。
public KeyedProcessFunction createKeyedProcessFunction(long threshold) {
return new KeyedProcessFunction<Tuple, FlowTuple11, FlowTuple11>(){
private long threshold;
private transient ListState<FlowTuple11> itemState;
public KeyedProcessFunction accept(long threshold){
this.threshold = threshold;
return this;
}
@Override
public void open(Configuration parameters) throws Exception {
super.open(parameters);
ListStateDescriptor<FlowTuple11> itemsStateDesc = new ListStateDescriptor("topn-state, FlowTuple11.class);
itemState = getRuntimeContext().getListState(itemsStateDesc);
}
@Override
public void processElement(FlowTuple11 value, Context ctx, Collector<FlowTuple11> out) throws Exception {
itemState.add(value);
ctx.timerService().registerEventTimeTimer(value.getBizTime());
}
@Override
public void onTimer(long timestamp, OnTimerContext ctx, Collector<FlowTuple11> out) throws Exception {
TreeMap<Long, FlowTuple11> top = new TreeMap<Long, FlowTuple11>(
new Comparator<Long>() {
//treemap按照key降序排列,相同value值不覆盖
@Override
public int compare(Long y, Long x) {
return (x < y) ? -1 : 1;
}
});
for (FlowTuple11 item:itemState.get()) {
top.put(item.getBps(), item);
// 输出topN
if (top.size() > threshold) {
top.pollLastEntry();
}
}
for (Map.Entry<Long, FlowTuple11> entry : top.entrySet()) {
out.collect(entry.getValue());
}
itemState.clear();
}
}.accept(threshold);
}
实测该方案性能最佳。