最近公司项目上用到频繁项发现算法,于是就用 Java 实现了一个 FP-growth 算法。
| 环境说明 | 版本说明 | 备注 |
| --- | --- | --- |
| 操作系统 | debian 9 | 无 |
| jdk | openjdk 1.8 | 无 |
关于fp-growth算法的原理请参考:
https://www.cnblogs.com/pinard/p/6307064.html 和《机器学习实战》。
FpTreeNode类
package com.slyk.sdp.algorithms.externalAlgorithms.fpTree;

import java.util.ArrayList;
import java.util.List;

/**
 * One node of an FP-tree: an item value, its occurrence count, a link to the
 * parent node and a list of child nodes.
 *
 * <p>Item equality is never decided with {@code equals}; every comparison is
 * delegated to the {@link FpTreeHelper} attached to the node.
 *
 * @param <T> type of the item stored in the node
 *
 * @author <a href='mailto:xiaomingyang@shulianyikang.com'>xiaomingyang</a>
 */
public class FpTreeNode<T> {

    /** Occurrence count of this node. */
    private long count;

    /** Item value held by this node. */
    private T nodeVal;

    /** Parent node; {@code null} when none has been assigned. */
    private FpTreeNode<T> parent;

    /** Child nodes; may be {@code null} until the first child is added. */
    private List<FpTreeNode<T>> children;

    /** Helper used to compare item values. */
    private FpTreeHelper<T> helper;

    /**
     * Creates a node with every field supplied by the caller.
     *
     * @param count    initial occurrence count
     * @param nodeVal  item value
     * @param parent   parent node, may be {@code null}
     * @param children child list, may be {@code null}
     * @param helper   helper used for item comparison
     */
    public FpTreeNode(long count, T nodeVal, FpTreeNode<T> parent,
            List<FpTreeNode<T>> children, FpTreeHelper<T> helper) {
        this.helper = helper;
        this.children = children;
        this.parent = parent;
        this.nodeVal = nodeVal;
        this.count = count;
    }

    /**
     * Appends {@code child} to this node's children and makes this node its
     * parent. The child list is created on demand.
     *
     * @param child node to attach
     * @return the node that was attached
     */
    public FpTreeNode<T> addChild(FpTreeNode<T> child) {
        if (this.getChildren() == null) {
            this.children = new ArrayList<FpTreeNode<T>>();
        }
        child.setParent(this);
        this.children.add(child);
        return child;
    }

    /**
     * Inserts a path below this node. Each element of {@code path} becomes the
     * parent of the element that follows it, e.g. for {@code a, b, c, d} the
     * chain is {@code a -> b -> c -> d}.
     *
     * <p>For the first element of the path, in this order:
     * <ol>
     *   <li>if it matches the value of {@code parentNode}, that node's count is
     *       incremented;</li>
     *   <li>otherwise, if this node already has a matching child, the child's
     *       count is incremented;</li>
     *   <li>otherwise a new child with count 1 is attached to this node.</li>
     * </ol>
     * The remainder of the path is then inserted recursively below the node
     * selected in that step.
     *
     * @param path       items of one transaction, in tree order; {@code null}
     *                   or empty is a no-op
     * @param parentNode parent of the first path element, may be {@code null}
     */
    public void addPath(List<T> path, FpTreeNode<T> parentNode) {
        if (path == null || path.isEmpty()) {
            return;
        }

        final T head = path.get(0);
        final FpTreeNode<T> next;

        if (parentNode != null && helper.nodeCompare(head, parentNode.getNodeVal())) {
            parentNode.increaseCountOne();
            next = parentNode;
        } else {
            FpTreeNode<T> match = this.findChild(head);
            if (match == null) {
                next = this.addChild(new FpTreeNode<T>(1, head, null, null, this.getHelper()));
            } else {
                match.increaseCountOne();
                next = match;
            }
        }

        next.addPath(path.subList(1, path.size()), next);
    }

    /**
     * Increments the count by one.
     *
     * @return the count after incrementing
     */
    public long increaseCountOne() {
        return this.increaseCount(1);
    }

    /**
     * Adds {@code increasement} to the count.
     *
     * @param increasement amount to add
     * @return the count after the addition
     */
    public long increaseCount(long increasement) {
        this.count = this.count + increasement;
        return this.count;
    }

    /**
     * Looks up a direct child whose value matches {@code childVal} according
     * to the helper.
     *
     * @param childVal value to look for
     * @return the first matching child, or {@code null} if there is none
     */
    public FpTreeNode<T> findChild(T childVal) {
        FpTreeNode<T> found = null;
        if (children != null) {
            for (FpTreeNode<T> candidate : children) {
                if (helper.nodeCompare(candidate.getNodeVal(), childVal)) {
                    found = candidate;
                    break;
                }
            }
        }
        return found;
    }

    @Override
    public String toString() {
        StringBuilder text = new StringBuilder(super.toString());
        text.append("-node (val:").append(this.getNodeVal());
        text.append(", count: ").append(this.getCount());
        text.append(")");
        return text.toString();
    }

    public long getCount() {
        return count;
    }

    public void setCount(long count) {
        this.count = count;
    }

    public T getNodeVal() {
        return nodeVal;
    }

    public void setNodeVal(T nodeVal) {
        this.nodeVal = nodeVal;
    }

    public FpTreeNode<T> getParent() {
        return parent;
    }

    public void setParent(FpTreeNode<T> parent) {
        this.parent = parent;
    }

    public List<FpTreeNode<T>> getChildren() {
        return children;
    }

    public void setChildren(List<FpTreeNode<T>> children) {
        this.children = children;
    }

    public FpTreeHelper<T> getHelper() {
        return helper;
    }

    public void setHelper(FpTreeHelper<T> helper) {
        this.helper = helper;
    }
}
FpTreeHeader类
package com.slyk.sdp.algorithms.externalAlgorithms.fpTree;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.springframework.util.Assert;

import com.slyk.sdp.algorithms.externalAlgorithms.fpTree.util.ListSortUtils;

/**
 * FP-tree header table: an insertion-ordered map from item to the number of
 * times the item occurs in the source transactions.
 *
 * <p>Besides the counts, the header keeps the filtered and sorted transactions
 * that are fed into tree construction ({@link #getInputData()}) and the
 * per-item node lists collected from a built tree ({@link #getTreeNodeMap()}).
 *
 * @param <K> item type
 * @param <V> unused; kept only so that existing declarations of the form
 *            {@code FpTreeHeader<T, Integer>} continue to compile. The map
 *            values are always {@link Integer}.
 *
 * @author <a href='mailto:xiaomingyang@shulianyikang.com'>xiaomingyang</a>
 */
public class FpTreeHeader<K, V> extends LinkedHashMap<K, Integer> {

    private static final Logger logger = LoggerFactory.getLogger(FpTreeHeader.class);

    private static final long serialVersionUID = 1L;

    /** Source transactions after filtering and sorting; input for building the FP-tree. */
    private List<List<K>> inputData = new LinkedList<List<K>>();

    /** Helper used to compare items and to order the header. */
    private FpTreeHelper<K> helper;

    /** For every item value, the tree nodes carrying that value. */
    private Map<K, List<FpTreeNode<K>>> treeNodeMap = new LinkedHashMap<K, List<FpTreeNode<K>>>();

    /**
     * Sets the helper and returns this header for chaining.
     *
     * @param helper helper to use
     * @return this header
     */
    public FpTreeHeader<K, V> addHelper(FpTreeHelper<K> helper) {
        this.setHelper(helper);
        return this;
    }

    /**
     * Walks the subtree rooted at {@code node} depth-first and records every
     * visited node in {@link #getTreeNodeMap()} under its item value.
     *
     * <p>A node whose count is {@code -1} is not recorded, but its children
     * are still visited. NOTE(review): {@code -1} presumably marks the
     * synthetic root node -- confirm against the tree builder.
     *
     * @param node root of the subtree to index
     */
    protected void buildNodeEntryList(FpTreeNode<K> node) {
        if (node.getCount() != -1) {
            List<FpTreeNode<K>> nodeList = treeNodeMap.get(node.getNodeVal());
            if (nodeList == null) {
                nodeList = new ArrayList<FpTreeNode<K>>();
                treeNodeMap.put(node.getNodeVal(), nodeList);
            }
            nodeList.add(node);
        }
        if (node.getChildren() == null) {
            return;
        }
        for (FpTreeNode<K> child : node.getChildren()) {
            buildNodeEntryList(child);
        }
    }

    /**
     * Builds the header table from the source transactions.
     *
     * <p>Steps, in order:
     * <ol>
     *   <li>count every item occurrence (an item repeated inside one
     *       transaction is counted once per occurrence);</li>
     *   <li>drop items whose count is below {@code absSupport};</li>
     *   <li>re-order the header with the comparator supplied by the helper;</li>
     *   <li>remove dropped items from every transaction, sort each transaction
     *       by descending count (ties broken by header position) and store the
     *       non-empty ones in {@link #getInputData()}.</li>
     * </ol>
     *
     * <p><b>Side effect:</b> the lists inside {@code sourceData} are filtered
     * and sorted in place, and those same list instances are what ends up in
     * {@link #getInputData()}. They must therefore support element removal.
     *
     * @param sourceData transactions to analyse; modified in place
     * @param absSupport minimum absolute occurrence count for an item to be kept
     * @return this header
     * @throws IllegalArgumentException if no helper has been set
     */
    public FpTreeHeader<K, V> buildTable(List<List<K>> sourceData, int absSupport) {
        Assert.notNull(this.helper, "helper cannot be null, Set helper first!");
        logger.debug("构建项头表.");

        countItems(sourceData);
        dropInfrequentItems(absSupport);
        sortHeader();
        collectInputData(sourceData);

        logger.debug("构建项头表完成.");
        return this;
    }

    /** Adds one to the header count of every item occurrence in {@code sourceData}. */
    private void countItems(List<List<K>> sourceData) {
        for (List<K> transaction : sourceData) {
            for (K item : transaction) {
                Integer seen = this.get(item);
                this.put(item, seen == null ? 1 : seen + 1);
            }
        }
    }

    /** Removes every header entry whose count is below {@code absSupport}. */
    private void dropInfrequentItems(int absSupport) {
        Iterator<Map.Entry<K, Integer>> entries = this.entrySet().iterator();
        while (entries.hasNext()) {
            if (entries.next().getValue() < absSupport) {
                entries.remove();
            }
        }
    }

    /** Re-inserts the header entries in the order defined by the helper's comparator. */
    @SuppressWarnings("unchecked")
    private void sortHeader() {
        List<K> keys = new ArrayList<K>(this.keySet());
        // The cast only pins the unused second type parameter; it cannot fail at runtime.
        ListSortUtils.sort(keys, this.getHelper().nodeEleCompare((FpTreeHeader<K, Integer>) this));

        Map<K, Integer> ordered = new LinkedHashMap<K, Integer>();
        for (K key : keys) {
            ordered.put(key, this.get(key));
        }
        this.clear();
        this.putAll(ordered);
    }

    /**
     * Filters and sorts every transaction in place and appends the non-empty
     * ones to {@link #inputData}.
     */
    private void collectInputData(List<List<K>> sourceData) {
        Comparator<K> order = headerOrder();
        for (List<K> transaction : sourceData) {
            for (Iterator<K> items = transaction.iterator(); items.hasNext();) {
                if (!this.containsKey(items.next())) {
                    items.remove();
                }
            }
            ListSortUtils.sort(transaction, order);
            if (!transaction.isEmpty()) {
                inputData.add(transaction);
            }
        }
    }

    /**
     * Comparator for items that are present in the header: higher count first,
     * ties broken by the item's position in the header. Both arguments must be
     * header keys, otherwise unboxing the missing count throws.
     */
    private Comparator<K> headerOrder() {
        return new Comparator<K>() {
            @Override
            public int compare(K o1, K o2) {
                // Counts start at 1 and only grow, so the subtraction cannot overflow.
                int byCount = FpTreeHeader.this.get(o2) - FpTreeHeader.this.get(o1);
                if (byCount != 0) {
                    return byCount;
                }

                int index1 = 0;
                int index2 = 0;
                int position = 0;
                for (K key : FpTreeHeader.this.keySet()) {
                    position++;
                    // The two checks must be independent. With "else if", comparing an
                    // item against itself (or a duplicate of it) left index2 at 0 and
                    // returned a positive value, which breaks the Comparator contract.
                    if (helper.nodeCompare(key, o1)) {
                        index1 = position;
                    }
                    if (helper.nodeCompare(key, o2)) {
                        index2 = position;
                    }
                }
                return index1 - index2;
            }
        };
    }

    public List<List<K>> getInputData() {
        return inputData;
    }

    public void setInputData(List<List<K>> inputData) {
        this.inputData = inputData;
    }

    public FpTreeHelper<K> getHelper() {
        return helper;
    }

    public void setHelper(FpTreeHelper<K> helper) {
        this.helper = helper;
    }

    public Map<K, List<FpTreeNode<K>>> getTreeNodeMap() {
        return treeNodeMap;
    }

    public void setTreeNodeMap(Map<K, List<FpTreeNode<K>>> treeNodeMap) {
        this.treeNodeMap = treeNodeMap;
    }
}
FpTree类:
package com.slyk.sdp.algorithms.externalAlgorithms.fpTree; import java.io.ByteArrayInputStream; import java.io.ByteArrayOutputStream; import java.io.IOException; import java.io.ObjectInputStream; import java.io.ObjectOutputStream; import java.util.ArrayList; import java.util.Collections; import java.util.Iterator; import java.util.LinkedHashMap; import java.util.List; import java.util.ListIterator; import java.util.Map; import java.util.Set; import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.springframework.util.Assert; import com.slyk.sdp.algorithms.externalAlgorithms.fpTree.util.DoubleKeyMap; /** * FPtree * * 描述:@param <T> * * @author <a href='mailto:xiaomingyang@shulianyikang.com'>xiaomingyang</a> * @created on 2019年6月3日,下午1:34:22 */ public class FpTree<T> { private static Logger logger = LoggerFactory.getLogger(FpTree.class); /** * 项头表 */ private FpTreeHeader<T, Integer> fpTreeHeader; /** * helper */ private FpTreeHelper<T> helper; /** * root node */ private FpTreeNode<T> root; /** * 默认频繁度阈值 */ protected static final int DEFAULT_ABS_SUPPORT = 0xf; private int absSupport = DEFAULT_ABS_SUPPORT; /** * 默认置信度 */ private static final int DEFAULT_CONFIDENT = 3; /** * 置信度 */ private int confident = DEFAULT_CONFIDENT; /** * 描述:挖掘树 * <br/>代码参考自《机器学习实战》 * * @param outList * @param tree * @param basePat * @return * @throws ClassNotFoundException * @throws IOException * @author <a href='mailto:xiaomingyang@shulianyikang.com'>xiaomingyang</a> * @created on 2019年5月31日,下午5:50:45 */ public List<List<T>> fpGrowth(List<List<T>> outList, FpTree<T> tree, List<T> prefix) throws ClassNotFoundException, IOException { logger.debug("开始conditionFpTree数据挖掘计算."); // // 挖掘频繁项集的步骤如下: // 1 从FP树提取条件模式基