Learning FP-Growth Algorithm in Python

Again, this is a study note on 'Machine Learning in Action'. This time it covers a refinement of the Apriori approach: the FP-Growth algorithm.

The key data structure is the conditional FP-tree - a trie in which each path holds the items of a transaction sorted by descending frequency.

1. We count the frequency of each item and construct the FP-tree. At the same time, we keep a header table that links together all tree nodes carrying the same item

2. For each item in the header table, its nodes give us several prefix paths, and we generate a conditional FP-tree out of them - this is the tree under the condition of that item (or items)

3. We recursively apply step 2 to each item of every generated conditional FP-tree


Please note that the code in the book has some defects. I fixed it as below:

#   Tree Node
#
class TreeNode:
    """A single node of an FP-tree.

    Instance attributes (all set in ``__init__``):
        name:     the item label stored in this node.
        count:    occurrence counter for this node.
        parent:   the node above this one (whatever was passed in).
        nodeLink: reference to another node, initially ``None``; other code
                  in this file uses it to chain nodes with the same item.
        chidren:  dict of child nodes keyed by item label (the spelling is
                  kept because other functions in this file read it).
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name, self.count = nameValue, numOccur
        self.nodeLink = None
        self.parent = parentNode
        self.chidren = {}

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's counter."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this subtree in pre-order, indenting one space per level."""
        pending = [(self, ind)]
        while pending:
            node, depth = pending.pop()
            print(' ' * depth, node.name, ' ', node.count)
            # Push children reversed so they are popped in dict order.
            kids = list(node.chidren.values())
            pending.extend((kid, depth + 1) for kid in reversed(kids))

'''
    ======= FP-Tree Construction (like Trie) =======
'''
def createTree(dataSet, minSup = 1): # dataSet is {}
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a frozenset) to its occurrence count.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        ``(retTree, headerTable)``. ``retTree`` is the root ``TreeNode``
        (named ``'Null'``); ``headerTable`` maps every kept item to
        ``[total count, first node of that item's node-link chain]``.
        ``(None, None)`` is returned when no item reaches ``minSup``.
    """
    #   Pass 1: total count of every item
    itemTotals = {}
    for trans, count in dataSet.items():
        for item in trans:
            itemTotals[item] = itemTotals.get(item, 0) + count

    #   Keep only qualified items; each entry is [frequency, link to 1st node]
    headerTable = {item: [total, None]
                   for item, total in itemTotals.items() if total >= minSup}
    if not headerTable:
        return None, None

    #   One global ranking (highest frequency first) shared by ALL
    #   transactions. Sorting each transaction by frequency alone lets items
    #   with equal counts appear in different orders in different
    #   transactions, which splits their support across separate branches.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0],
                    reverse=True)
    rank = {item: position for position, item in enumerate(ranked)}

    retTree = TreeNode('Null', 1, None)
    #   Pass 2: insert every transaction, restricted to frequent items
    for tranSet, count in dataSet.items():
        frequentItems = {item for item in tranSet if item in rank}
        if frequentItems:
            orderedItems = sorted(frequentItems, key=rank.__getitem__)
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable


def updateTree(items, inTree, headerTable, count):
    """Insert one ordered item sequence into the tree below ``inTree``.

    Walks down from ``inTree`` one item at a time. An existing child has its
    counter increased by ``count``; a missing child is created with that
    count and appended to the item's node-link chain in ``headerTable``.

    Args:
        items: item labels in the order they should appear on the path.
        inTree: node to start from (its ``chidren`` dict is modified).
        headerTable: dict item -> ``[frequency, first linked node]``;
            slot 1 is filled in / extended for newly created nodes.
        count: amount added to every node on the path.

    Returns:
        None. The tree and ``headerTable`` are updated in place.
    """
    node = inTree
    for item in items:
        child = node.chidren.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = TreeNode(item, count, node)
            node.chidren[item] = child
            #   Append the new node to the linked list in headerTable
            entry = headerTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        node = child

def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of the ``nodeLink`` chain.

    Starting at ``nodeToTest``, follow ``nodeLink`` until a node whose link
    is ``None`` is reached, then point that node's link at ``targetNode``.
    """
    tail = nodeToTest
    while True:
        following = tail.nodeLink
        if following is None:
            break
        tail = following
    tail.nodeLink = targetNode

'''
    ======= Creating conditional FP trees =======
'''

def ascendTree(leafNode, prefixPath):
    """Collect node names from ``leafNode`` upward into ``prefixPath``.

    Appends the ``name`` of ``leafNode`` and of each ancestor, bottom-up,
    stopping before the node whose ``parent`` is ``None`` (that node's name
    is not appended). ``prefixPath`` is extended in place.
    """
    current = leafNode
    while current.parent is not None:
        prefixPath.append(current.name)
        current = current.parent

def findPrefixPath(treeNode):
    """Collect the conditional pattern base for one item.

    Follows the ``nodeLink`` chain starting at ``treeNode``. For every node
    on the chain, the names of its ancestors (the node itself excluded) are
    gathered with ``ascendTree`` and stored as a frozenset.

    Args:
        treeNode: first node of the chain, or ``None`` for an empty chain.

    Returns:
        dict mapping ``frozenset(ancestor names)`` to the summed ``count``
        of the chain nodes that have exactly that ancestor set. Nodes with
        no ancestors below the root contribute nothing.
    """
    condPats = {}
    while treeNode is not None: # do ascending for each instance of the same type
        prefixPath = []
        ascendTree(treeNode, prefixPath)
        if len(prefixPath) > 1:
            base = frozenset(prefixPath[1:]) # drop the node's own name
            # Accumulate rather than overwrite: two chain nodes may share
            # the same ancestor set, and both counts must be kept.
            condPats[base] = condPats.get(base, 0) + treeNode.count
        treeNode = treeNode.nodeLink
    return condPats

'''
    ======= Mining =======
'''
def mineTree(headerTable, minSup, preFix, freqItemList, level = 0):
    """Recursively mine frequent itemsets from a header table.

    Items are visited from lowest to highest frequency. For each item the
    set ``preFix + {item}`` is appended to ``freqItemList`` together with
    the item's frequency, then a conditional tree is built from the item's
    prefix paths and mined in turn.

    Args:
        headerTable: dict item -> ``[frequency, first linked node]``.
        minSup: minimum support passed on to ``createTree``.
        preFix: set of items already fixed as the condition; not modified.
        freqItemList: output list, extended with ``(itemset, frequency)``.
        level: recursion depth; passed along as ``level + 1`` but not
            otherwise used here.
    """
    #   lowest-frequency item first
    ascending = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in ascending:
        entry = headerTable[basePat]
        grownSet = preFix.copy()
        grownSet.add(basePat)
        freqItemList.append((grownSet, entry[0])) # (itemset, its occurrence)

        #   conditional tree for everything that co-occurs with grownSet
        _, condHeader = createTree(findPrefixPath(entry[1]), minSup)
        if condHeader is None:
            continue
        mineTree(condHeader, minSup, grownSet, freqItemList, level + 1)


  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
FP-growth算法是一种非常常用的关联分析算法,可以用于挖掘数据集中的频繁项集,进而发现数据集中不同项之间的关联关系。C++作为一种高效的编程语言,也可以用于实现FP-growth算法。 下面是一个基于C++类的FP-growth算法实现及案例示例: ```cpp #include <iostream> #include <fstream> #include <algorithm> #include <vector> #include <map> using namespace std; class Node { public: string name; int count; Node* parent; map<string, Node*> children; Node() { name = ""; count = 0; parent = NULL; } Node(string name, int count) { this->name = name; this->count = count; parent = NULL; } void inc(int num) { count += num; } }; class FPTree { public: Node* root; map<string, int> headerTable; FPTree() { root = new Node(); headerTable.clear(); } void insert(vector<string>& transaction) { Node* cur = root; for (int i = 0; i < transaction.size(); i++) { string item = transaction[i]; if (cur->children.count(item) == 0) { cur->children[item] = new Node(item, 1); cur->children[item]->parent = cur; if (headerTable.count(item) == 0) { headerTable[item] = 1; } else { headerTable[item]++; } } else { cur->children[item]->count++; } cur = cur->children[item]; } } }; class FPGrowth { public: FPTree* tree; map<string, int> items; vector<vector<string>> transactions; FPGrowth() { tree = NULL; } void loadTransactions(string filename) { ifstream fin(filename); if (!fin.is_open()) { return; } string line; while (getline(fin, line)) { vector<string> transaction; string item; for (int i = 0; i < line.size(); i++) { if (line[i] == ' ') { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); item = ""; } else { item += line[i]; } } if (!item.empty()) { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); } transactions.push_back(transaction); } fin.close(); } bool cmp(const pair<string, int>& a, const pair<string, int>& b) { return a.second > b.second; } void buildTree() { tree = new FPTree(); for (int i = 0; i < transactions.size(); i++) { vector<string>& transaction = transactions[i]; sort(transaction.begin(), 
transaction.end(), [&](string a, string b) { return items[a] > items[b]; }); tree->insert(transaction); } } void findPrefixPath(string item, Node* node, vector<Node*>& prefixPath) { while (node != tree->root) { if (node->name == item) { prefixPath.push_back(node); } node = node->parent; } } void mineFrequentItemsets(int minSup) { vector<pair<string, int>> freqItems; for (auto it = items.begin(); it != items.end(); it++) { if (it->second >= minSup) { freqItems.push_back(*it); } } sort(freqItems.begin(), freqItems.end(), cmp); for (int i = 0; i < freqItems.size(); i++) { vector<string> prefix; prefix.push_back(freqItems[i].first); int sup = freqItems[i].second; findPrefixPaths(prefix, tree->headerTable, sup); } } void findPrefixPaths(vector<string>& prefix, map<string, Node*> headerTable, int sup) { string item = prefix[prefix.size() - 1]; Node* node = headerTable[item]->parent; vector<Node*> prefixPath; while (node != tree->root) { prefixPath.clear(); findPrefixPath(item, node, prefixPath); vector<string> subPrefix; for (int i = 0; i < prefix.size() - 1; i++) { subPrefix.push_back(prefix[i]); } subPrefix.push_back(node->name); int count = node->count; for (int i = 0; i < prefixPath.size(); i++) { count = min(count, prefixPath[i]->count); } if (count >= sup) { cout << "{"; for (int i = 0; i < subPrefix.size(); i++) { cout << subPrefix[i] << " "; } cout << item << "} : " << count << endl; findPrefixPaths(subPrefix, node->children, sup); } node = node->parent; } } }; int main() { FPGrowth fpg; fpg.loadTransactions("transactions.txt"); fpg.buildTree(); fpg.mineFrequentItemsets(2); return 0; } ``` 上述代码实现了一个基于类的FP-growth算法,并且支持从文件中加载交易数据,并挖掘出频繁项集。其中,`Node`类表示FP树中的节点,`FPTree`类表示FP树,`FPGrowth`类表示FP-growth算法。具体实现细节可以参考代码注释。 需要注意的是,本示例中的实现仅支持从文件中加载交易数据,并不支持在线实时插入交易数据,如果需要支持在线插入数据,需要对代码进行一定的修改。另外,本示例中的实现也没有进行过多的优化,不适用于大型数据集的挖掘。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值