package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
public class Pattern implements Comparable<Pattern>
pattern封装了一组item,每个item的support值,整体的support值
判断pattern是否包含。由于pattern已经是升序排列,两个数组比较推进即可。
包含有几个个条件:1.本数组长度要小外来 2.如果外来数组已经大于本数组当前值则不可能3.要么两个数组都达到末尾,要么外来数组没有到末尾
扩容。增大1.5倍再复制之前的。
增加一个item,总support取所有item最小值
package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
public final class FrequentPatternMaxHeap
在堆中保存top k个pattern
核心结构
同一support值的pattern组成一个set
判断是否可添加
如果容量未满直接可添加,如果已满,若support值大于当前最小也可以添加
插入新pattern,如果堆满了,存在替换过程,如果有子pattern检查还要维护同support的集合。insert函数主要进行插入前的条件筛选,主要逻辑。
具体的插入过程。检查子集,如果堆中已经有超集则不插入。
public class Pattern implements Comparable<Pattern>
pattern封装了一组item,每个item的support值,整体的support值
private int[] pattern;
private long support = Long.MAX_VALUE;
private long[] supportValues;
判断pattern是否包含。由于pattern已经是升序排列,两个数组比较推进即可。
包含有几个个条件:1.本数组长度要小外来 2.如果外来数组已经大于本数组当前值则不可能3.要么两个数组都达到末尾,要么外来数组没有到末尾
public final boolean isSubPatternOf(Pattern frequentPattern) {
int[] otherPattern = frequentPattern.getPattern();
int otherLength = frequentPattern.length();
if (this.length() > frequentPattern.length()) {
return false;
}
int i = 0;
int otherI = 0;
while (i < length && otherI < otherLength) {
if (otherPattern[otherI] == pattern[i]) {
otherI++;
i++;
} else if (otherPattern[otherI] < pattern[i]) {
otherI++;
} else {
return false;
}
}
return otherI != otherLength || i == length;
}
扩容。增大1.5倍再复制之前的。
private void resize() {
int size = (int) (GROWTH_RATE * length);
if (size < DEFAULT_INITIAL_SIZE) {
size = DEFAULT_INITIAL_SIZE;
}
int[] oldpattern = pattern;
long[] oldSupport = supportValues;
this.pattern = new int[size];
this.supportValues = new long[size];
System.arraycopy(oldpattern, 0, this.pattern, 0, length);
System.arraycopy(oldSupport, 0, this.supportValues, 0, length);
}
增加一个item,总support取所有item最小值
public final void add(int id, long supportCount) {
dirty = true;
if (length >= pattern.length) {
resize();
}
this.pattern[length] = id;
this.supportValues[length++] = supportCount;
this.support = supportCount > this.support ? this.support : supportCount;
}
package org.apache.mahout.fpm.pfpgrowth.fpgrowth;
public final class FrequentPatternMaxHeap
在堆中保存top k个pattern
核心结构
private final PriorityQueue<Pattern> queue;
同一support值的pattern组成一个set
private final OpenLongObjectHashMap<Set<Pattern>> patternIndex;
判断是否可添加
如果容量未满直接可添加,如果已满,若support值大于当前最小也可以添加
public boolean addable(long support) {
return count < maxSize || least.support() <= support;
}
插入新pattern,如果堆满了,存在替换过程,如果有子pattern检查还要维护同support的集合。insert函数主要进行插入前的条件筛选,主要逻辑。
public void insert(Pattern frequentPattern) {
if (frequentPattern.length() == 0) {
return;
}
if (count == maxSize) {
if (frequentPattern.compareTo(least) > 0 && addPattern(frequentPattern)) {
Pattern evictedItem = queue.poll();
least = queue.peek();
if (subPatternCheck) {
patternIndex.get(evictedItem.support()).remove(evictedItem);
}
}
} else {
if (addPattern(frequentPattern)) {
count++;
if (least == null) {
least = frequentPattern;
} else {
if (least.compareTo(frequentPattern) < 0) {
least = frequentPattern;
}
}
}
}
}
具体的插入过程。检查子集,如果堆中已经有超集则不插入。
private boolean addPattern(Pattern frequentPattern) {
if (subPatternCheck) {
Long index = frequentPattern.support();
if (patternIndex.containsKey(index)) {
Set<Pattern> indexSet = patternIndex.get(index);
boolean replace = false;
Pattern replacablePattern = null;
for (Pattern p : indexSet) {
if (frequentPattern.isSubPatternOf(p)) {
return false;
} else if (p.isSubPatternOf(frequentPattern)) {
replace = true;
replacablePattern = p;
break;
}
}
if (replace) {
indexSet.remove(replacablePattern);
if (!indexSet.contains(frequentPattern) && queue.add(frequentPattern)) {
indexSet.add(frequentPattern);
}
return false;
}
queue.add(frequentPattern);
indexSet.add(frequentPattern);
} else {
queue.add(frequentPattern);
Set<Pattern> patternList;
if (!patternIndex.containsKey(index)) {
patternList = new HashSet<Pattern>();
patternIndex.put(index, patternList);
}
patternList = patternIndex.get(index);
patternList.add(frequentPattern);
}
} else {
queue.add(frequentPattern);
}
return true;
}