[Python]FPG(FP-growth)算法核心实现

FPG是FP-growth算法的简称,推荐算法=》关联算法中最有名的算法之一,是Apriori算法的性能优化版。

参考了一些示例,自行再实现,具体算法如下。


步骤归纳为:
1、第一次遍历获取HeaderTable,包括去重、计频繁数、依据最小支持度去项、重排序(频繁数倒序);
2、第二次遍历更新原列表,包括依据headerTable去除小于最小支持度的项、重排序
3、建FP Tree,包括创建新节点、相似元素项节点合并


调用入口 (test04.py):

#coding:utf-8
import test03
oneDimList = []
def loadSimpDat():
    simpDat = [['r', 'z', 'h', 'j', 'p'],
               ['z', 'y', 'x', 'w', 'v', 'u', 't', 's'],
               ['z'],
               ['r', 'x', 'n', 'o', 's'],
               ['y', 'r', 'x', 'z', 'q', 't', 'p'],
               ['y', 'z', 'x', 'e', 'q', 's', 't', 'm']]
    return simpDat

# 数据准备
simpDat = loadSimpDat()
for list in simpDat:
    oneDimList += list
# 建header table
headerTable = test03.HeaderTable().create(oneDimList, 3)
# 建FP tree
buildTree = test03.BuildTree()
updDat = buildTree.refactor(simpDat, headerTable)
fpTree = buildTree.update(updDat)
# 打印结果
for fpTreeItem in fpTree:
    print 'parent:' + fpTreeItem.parent + ', name:' + fpTreeItem.name + \
          ', num:' + str(fpTreeItem.numOccur)


核心实现 (test03.py):

#coding:utf-8
class HeaderTable:
    def __init__(self):
        pass

    def create(self, dat, minsup):
        headerTable = {}
        # 去重
        setDat = set(dat)
        # 计频繁数
        for key in setDat:
            headerTable[key] = dat.count(key)
        # 依据最小支持度去项
        for k,v in headerTable.items():
            if v < minsup:
                del(headerTable[k])
        # 重排序
        headerTable = sorted(headerTable.items(),key=lambda i:i[1],reverse=True)
        print headerTable
        return headerTable

class FPTreeItem:
    def __init__(self, key, name, numOccur, parent):
        self.key = key              # key
        self.name = name            # 项名
        self.numOccur = numOccur    # 频繁值
        self.parent = parent        # 父节点

class BuildTree:

    # inDat: (list) [[],[]]
    def refactor(self, inDat, headerTable):
        lineCounter = 0
        datLine =[]
        dat = []

        # 依据headerTable去除小于最小支持度的项、重排序
        for list in inDat:
            lineCounter += 1
            for i in headerTable:
                if i[0] in list:
                    datLine.append(i[0])
            dat.append(datLine)
            datLine = []
        return dat

    # updDat: (list) [[],[]]
    def update(self, updDat):
        fpTree = []

        for list in updDat:
            parent = ''
            keyLink = ''
            for item in list:
                parent = keyLink
                keyLink += item

                for fpTreeItem in fpTree:
                    if keyLink == fpTreeItem.key:
                        # 相似元素项节点合并
                        fpTreeItem.numOccur += 1
                        break
                # 没有这个元素项时创建一个新节点
                else:
                    fpTreeItem = FPTreeItem(keyLink, item, 1, parent)
                    fpTree.append(fpTreeItem)
        return fpTree


参考:

http://www.cnblogs.com/zhangchaoyang/articles/2198946.html
http://www.cnblogs.com/qwertWZ/p/4510857.html

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
FP-growth算法是一种非常常用的关联分析算法,可以用于挖掘数据集中的频繁项集,进而发现数据集中不同项之间的关联关系。C++作为一种高效的编程语言,也可以用于实现FP-growth算法。 下面是一个基于C++类的FP-growth算法实现及案例示例: ```cpp #include <iostream> #include <fstream> #include <algorithm> #include <vector> #include <map> using namespace std; class Node { public: string name; int count; Node* parent; map<string, Node*> children; Node() { name = ""; count = 0; parent = NULL; } Node(string name, int count) { this->name = name; this->count = count; parent = NULL; } void inc(int num) { count += num; } }; class FPTree { public: Node* root; map<string, int> headerTable; FPTree() { root = new Node(); headerTable.clear(); } void insert(vector<string>& transaction) { Node* cur = root; for (int i = 0; i < transaction.size(); i++) { string item = transaction[i]; if (cur->children.count(item) == 0) { cur->children[item] = new Node(item, 1); cur->children[item]->parent = cur; if (headerTable.count(item) == 0) { headerTable[item] = 1; } else { headerTable[item]++; } } else { cur->children[item]->count++; } cur = cur->children[item]; } } }; class FPGrowth { public: FPTree* tree; map<string, int> items; vector<vector<string>> transactions; FPGrowth() { tree = NULL; } void loadTransactions(string filename) { ifstream fin(filename); if (!fin.is_open()) { return; } string line; while (getline(fin, line)) { vector<string> transaction; string item; for (int i = 0; i < line.size(); i++) { if (line[i] == ' ') { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); item = ""; } else { item += line[i]; } } if (!item.empty()) { if (items.count(item) == 0) { items[item] = 1; } else { items[item]++; } transaction.push_back(item); } transactions.push_back(transaction); } fin.close(); } bool cmp(const pair<string, int>& a, const pair<string, int>& b) { return a.second > b.second; } void buildTree() { tree = new FPTree(); for (int i = 0; i < transactions.size(); i++) { vector<string>& transaction = transactions[i]; sort(transaction.begin(), transaction.end(), [&](string a, string b) { return items[a] > items[b]; }); tree->insert(transaction); } } void findPrefixPath(string item, Node* node, vector<Node*>& prefixPath) { while (node != tree->root) { if (node->name == item) { prefixPath.push_back(node); } node = node->parent; } } void mineFrequentItemsets(int minSup) { vector<pair<string, int>> freqItems; for (auto it = items.begin(); it != items.end(); it++) { if (it->second >= minSup) { freqItems.push_back(*it); } } sort(freqItems.begin(), freqItems.end(), cmp); for (int i = 0; i < freqItems.size(); i++) { vector<string> prefix; prefix.push_back(freqItems[i].first); int sup = freqItems[i].second; findPrefixPaths(prefix, tree->headerTable, sup); } } void findPrefixPaths(vector<string>& prefix, map<string, Node*> headerTable, int sup) { string item = prefix[prefix.size() - 1]; Node* node = headerTable[item]->parent; vector<Node*> prefixPath; while (node != tree->root) { prefixPath.clear(); findPrefixPath(item, node, prefixPath); vector<string> subPrefix; for (int i = 0; i < prefix.size() - 1; i++) { subPrefix.push_back(prefix[i]); } subPrefix.push_back(node->name); int count = node->count; for (int i = 0; i < prefixPath.size(); i++) { count = min(count, prefixPath[i]->count); } if (count >= sup) { cout << "{"; for (int i = 0; i < subPrefix.size(); i++) { cout << subPrefix[i] << " "; } cout << item << "} : " << count << endl; findPrefixPaths(subPrefix, node->children, sup); } node = node->parent; } } }; int main() { FPGrowth fpg; fpg.loadTransactions("transactions.txt"); fpg.buildTree(); fpg.mineFrequentItemsets(2); return 0; } ``` 上述代码实现了一个基于类的FP-growth算法,并且支持从文件中加载交易数据,并挖掘出频繁项集。其中,`Node`类表示FP树中的节点,`FPTree`类表示FP树,`FPGrowth`类表示FP-growth算法。具体实现细节可以参考代码注释。 需要注意的是,本示例中的实现仅支持从文件中加载交易数据,并不支持在线实时插入交易数据,如果需要支持在线插入数据,需要对代码进行一定的修改。另外,本示例中的实现也没有进行过多的优化,不适用于大型数据集的挖掘。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值