完整正确的fpgrowth代码-python

88 篇文章 11 订阅
30 篇文章 0 订阅

完整正确的fpgrowth代码-python

网上关于fpgrowth代码基本上都是错的,跑出来的结果不唯一,这里我给一份正确的fpgrowth代码

# coding:utf-8
class treeNode:
    """A single node of an FP-tree.

    Every node stores the item it represents, a running count, a link to
    its parent, a dict of children keyed by item, and ``nodeLink``, which
    chains together nodes that carry the same item.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        self.name = nameValue      # item stored at this node
        self.count = numOccur      # initial count; grown later through inc()
        self.nodeLink = None       # next node holding the same item, if any
        self.parent = parentNode   # parent node (None is passed for the root)
        self.children = {}         # item -> child treeNode

    def inc(self, numOccur):
        """Add ``numOccur`` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this node and its subtree, one node per line.

        ``ind`` is the indentation level; each level adds two spaces.
        """
        # The line is joined by hand and printed as ONE string so the
        # output is identical whether ``print`` is the Python 2 statement
        # or the Python 3 function.
        print(' '.join(['  ' * ind, str(self.name), ' ', str(self.count)]))
        for child in self.children.values():
            child.disp(ind + 1)

def updateHeader(nodeToTest, targetNode):
    """Append ``targetNode`` to the end of a ``nodeLink`` chain.

    The chain is followed from ``nodeToTest`` until a node whose
    ``nodeLink`` is None is reached; that node is then linked to
    ``targetNode``.
    """
    tail = nodeToTest
    successor = tail.nodeLink
    while successor is not None:
        tail = successor
        successor = tail.nodeLink
    tail.nodeLink = targetNode
def updateFPtree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the FP-tree below ``inTree``.

    Parameters:
        items: the transaction's items, already filtered and ordered.
        inTree: the node under which the path is inserted.
        headerTable: dict mapping item -> [count, first node]; slot 1 is
            filled in, or its nodeLink chain extended, whenever a new
            node is created for an item.
        count: amount added to every node along the path.

    The walk is iterative. The previous recursive version used one stack
    frame per item (and re-sliced ``items`` at every level), so a long
    transaction could exceed the interpreter's recursion limit. An empty
    ``items`` is now a no-op instead of an IndexError.
    """
    node = inTree
    for item in items:
        if item in node.children:
            # The item already hangs below the current node: just count it.
            child = node.children[item]
            child.inc(count)
        else:
            # Start a new branch and register it in the header table.
            child = treeNode(item, count, node)
            node.children[item] = child
            if headerTable[item][1] is None:
                headerTable[item][1] = child
            else:
                updateHeader(headerTable[item][1], child)
        node = child

def createFPtree(dataSet, minSup=1):
    """Build an FP-tree and its header table from counted transactions.

    Parameters:
        dataSet: dict mapping each transaction (an iterable of items,
            e.g. a frozenset) to the number of times it occurs.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        (retTree, headerTable), where headerTable maps every kept item to
        ``[total count, first tree node for that item]``; or
        ``(None, None)`` when no item reaches ``minSup``.
    """
    def tieBreak(item):
        # Secondary sort key for items whose counts are equal, so the item
        # order (and therefore the tree) is deterministic. Items that look
        # like integers keep the int() ordering that was used before; any
        # other item falls back to its string form instead of raising
        # ValueError.
        try:
            return (1, int(item), str(item))
        except (TypeError, ValueError):
            return (0, 0, str(item))

    # First pass: total count of every item.
    headerTable = {}
    for trans in dataSet:
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + dataSet[trans]

    # Drop items below the minimum support. Iterate over a snapshot of the
    # keys: deleting from a dict while iterating it raises RuntimeError on
    # Python 3.
    for k in list(headerTable.keys()):
        if int(headerTable[k]) < minSup:
            del headerTable[k]

    if not headerTable:
        return None, None
    for k in headerTable:
        headerTable[k] = [headerTable[k], None]  # element: [count, node]

    # Second pass: insert every transaction, filtered and ordered.
    retTree = treeNode('Null Set', 1, None)
    for tranSet, count in dataSet.items():
        # Keep only the frequent items of this transaction: item -> count.
        localD = {}
        for item in tranSet:
            if item in headerTable:
                localD[item] = headerTable[item][0]
        if localD:
            # Highest total count first; ties resolved by tieBreak.
            orderedItem = sorted(
                localD,
                key=lambda item: (localD[item], tieBreak(item)),
                reverse=True)
            updateFPtree(orderedItem, retTree, headerTable, count)
    return retTree, headerTable

# Walk from a node up towards the root
def ascendFPtree(leafNode, prefixPath):
    """Append the names on the path from ``leafNode`` upwards to ``prefixPath``.

    Names are appended bottom-up, starting with ``leafNode`` itself. The
    node whose ``parent`` is None is not included.

    Implemented as a loop: the recursive form used one stack frame per
    tree level and could exceed the recursion limit on deep trees.
    """
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
# Conditional pattern base
def findPrefixPath(basePat, myHeaderTab):
    """Collect the prefix paths of every tree node that holds ``basePat``.

    The nodes are reached through ``myHeaderTab[basePat][1]`` and the
    ``nodeLink`` chain that follows it. For each node, the names above it
    (the node itself excluded) form a frozenset key whose value is that
    node's count. Nodes with no names above them contribute nothing.

    Returns:
        dict mapping frozenset(prefix items) -> node count.
    """
    condPats = {}
    node = myHeaderTab[basePat][1]  # first node holding basePat
    while node is not None:
        path = []
        ascendFPtree(node, path)  # bottom-up, path[0] is the node itself
        ancestors = path[1:]
        if ancestors:
            condPats[frozenset(ancestors)] = node.count
        node = node.nodeLink  # next node holding basePat
    return condPats

def mineFPtree(inTree, headerTable, minSup, preFix, freqItemList):
    """Recursively collect frequent itemsets from an FP-tree.

    Parameters:
        inTree: the tree that belongs to ``headerTable`` (not read by this
            function; kept for interface compatibility).
        headerTable: dict mapping item -> [count, first node].
        minSup: minimum count passed on to createFPtree for the
            conditional trees.
        preFix: set of items already fixed on the current recursion path.
        freqItemList: output list; every itemset found is appended as a set.
    """
    # Visit items by ascending total count. Sort on the count alone:
    # sorting on the whole [count, node] entry compares treeNode objects
    # whenever two counts tie, which raises TypeError on Python 3 and is
    # address-dependent on Python 2.
    bigL = [entry[0] for entry in
            sorted(headerTable.items(), key=lambda p: p[1][0])]
    for basePat in bigL:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        # Conditional pattern base of the current itemset ...
        condPattBases = findPrefixPath(basePat, headerTable)
        # ... and the conditional FP-tree built from it.
        myCondTree, myHead = createFPtree(condPattBases, minSup)
        if myHead is not None:
            mineFPtree(myCondTree, myHead, minSup, newFreqSet, freqItemList)

def loadSimpDat():
    """Return the small built-in sample dataset.

    The result is a list of six transactions, each a list of
    single-character items.
    """
    rows = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtp',
        'yzxeqstm',
    )
    return [list(row) for row in rows]
def createInitSet(dataSet):
    """Count identical transactions.

    Parameters:
        dataSet: iterable of transactions, each an iterable of hashable
            items.

    Returns:
        dict mapping frozenset(transaction) -> number of occurrences.
        Transactions that contain the same items in a different order are
        counted together.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # dict.get replaces dict.has_key, which no longer exists on Python 3.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict

def calSuppData(headerTable, freqItemList, total):
    """Compute the relative support of every itemset in ``freqItemList``.

    Parameters:
        headerTable: header table of the full FP-tree,
            item -> [total count, first node].
        freqItemList: iterable of non-empty itemsets whose items all
            appear in ``headerTable``.
        total: number of transactions; supports are divided by it.

    Returns:
        dict mapping frozenset(itemset) -> support as a float.
    """
    def tieBreak(item):
        # Must order tied items the same way the tree was built (int()
        # ordering for integer-like items), otherwise the item picked
        # below may not be the deepest one on its paths.
        try:
            return (1, int(item), str(item))
        except (TypeError, ValueError):
            return (0, 0, str(item))

    suppData = {}
    for itemSet in freqItemList:
        # Lowest count first: the first element is the item that sits
        # deepest in the tree among the members of this itemset.
        ordered = sorted(
            itemSet, key=lambda x: (headerTable[x][0], tieBreak(x)))
        if len(ordered) == 1:
            # The header table already holds the exact count of a single
            # item. Summing its prefix paths would miss every occurrence
            # that has no ancestors, because findPrefixPath skips those.
            support = headerTable[ordered[0]][0]
        else:
            base = findPrefixPath(ordered[0], headerTable)
            rest = frozenset(ordered[1:])
            # Add up the paths of the deepest item that contain all the
            # other items of the set.
            support = sum(cnt for path, cnt in base.items()
                          if rest.issubset(path))
        suppData[frozenset(ordered)] = support / float(total)
    return suppData

def aprioriGen(Lk, k):
    """Merge itemsets of ``Lk`` that share their first ``k-2`` sorted items.

    Parameters:
        Lk: list of frozensets, all of size ``k-1``.
        k: size of the candidates to generate.

    Returns:
        list with the union of every pair in ``Lk`` whose sorted prefixes
        of length ``k-2`` are equal.
    """
    # Sort BEFORE slicing. Slicing the raw iteration order of a set and
    # sorting afterwards compares arbitrary elements, so valid pairs can
    # be missed. The prefixes are also computed once per itemset instead
    # of once per pair.
    prefixes = [sorted(itemSet)[:k - 2] for itemSet in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList

def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate the rules ``(freqSet - conseq) --> conseq`` for each conseq in H.

    Parameters:
        freqSet: frozenset, the itemset the rules are derived from.
        H: iterable of frozensets, the candidate consequents.
        supportData: dict mapping frozenset -> support; it must contain
            ``freqSet`` and every ``freqSet - conseq`` (KeyError otherwise).
        br1: output list; every accepted rule is appended as
            ``(antecedent, consequent, confidence)``.
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        list of the consequents whose rule was accepted. Each accepted
        rule is also printed.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        antSupport = supportData[antecedent]
        if antSupport != 0:  # a zero support would divide by zero
            # float() keeps this a true division under Python 2 even if
            # the supports happen to be integers.
            conf = float(supportData[freqSet]) / antSupport
            if conf >= minConf:
                # Single-argument call: prints the same text whether
                # ``print`` is the Python 2 statement or the Python 3
                # function.
                print("{0} --> {1} conf:{2}".format(antecedent, conseq, conf))
                br1.append((antecedent, conseq, conf))
                prunedH.append(conseq)
    return prunedH

def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Generate rules whose consequents are one item larger than those in H.

    Parameters:
        freqSet: frozenset, the itemset the rules are derived from.
        H: list of frozensets of equal size, the current consequents.
            These are NOT evaluated here; only the larger consequents
            merged from them are.
        supportData: dict mapping frozenset -> support, used by calcConf.
        br1: output list that calcConf appends accepted rules to.
        minConf: minimum confidence a rule needs to be accepted.

    Recurses while more than one merged consequent passes ``minConf`` and
    ``freqSet`` is large enough to leave a non-empty antecedent.
    """
    if not H:
        # Nothing to merge; previously this raised IndexError on H[0].
        return
    m = len(H[0])
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)

def generateRules(freqItemList, supportData, minConf=0.7):
    """Derive association rules from frequent itemsets.

    Parameters:
        freqItemList: iterable of frozensets (calcConf uses
            ``freqSet - conseq`` as a dict key, so plain sets will not do).
        supportData: dict mapping frozenset -> support. For single-item
            itemsets the antecedent is the empty frozenset, so
            ``supportData`` must contain ``frozenset()`` as well.
        minConf: minimum confidence a rule needs to be accepted.

    Returns:
        list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for freqSet in freqItemList:
        H1 = [frozenset([item]) for item in freqSet]
        # Always evaluate the single-item consequents first. Handing
        # multi-item sets straight to rulesFromConseq skipped this level,
        # so 2-item sets produced no rules at all and larger sets lost
        # every rule with a one-item consequent.
        H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
        # Larger consequents are merged from the surviving ones; at least
        # two are needed for a merge.
        if len(freqSet) > 1 and len(H1) > 1:
            rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList

main 函数如下:
注意:处理后的数据集(即下面的 parsedDat)应是一个二维列表,每个子列表是一条事务,例如
l=[[a,b,c],[d,c,e,g],[a,e,c,e]]这样就可以了

import fpgrowth 
import time
import data_process
# '''simple data'''
# simDat = fpgrowth.loadSimpDat()
# initSet = fpgrowth.createInitSet(simDat)
# myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, 3)
# myFPtree.disp()
# print fpgrowth.findPrefixPath('z', myHeaderTab)
# print fpgrowth.findPrefixPath('r', myHeaderTab)
# print fpgrowth.findPrefixPath('t', myHeaderTab)
# freqItems = []
# fpgrowth.mineFPtree(myFPtree, myHeaderTab, 3, set([]), freqItems)
# for x in freqItems:
#     print x


#先跑一下

'''kosarak data'''
start = time.time()
n = 11  # minimum support, compared against absolute item counts
#C:\Users\gaoxi\source\repos\fpgrowth\fpgrowth\fpgrowth-master\data\kosarak.dat
#with open(r"C:\Users\gaoxi\source\repos\fpgrowth\fpgrowth\fpgrowth-master\data\kosarak.dat", "rb") as f:
#    parsedDat = [line.split() for line in f.readlines()]
#print parsedDat
parsedDat = data_process.get_data()
initSet = fpgrowth.createInitSet(parsedDat)
myFPtree, myHeaderTab = fpgrowth.createFPtree(initSet, n)
freqItems = []

# createFPtree returns (None, None) when no item reaches the minimum
# support; there is nothing to mine in that case.
if myHeaderTab is not None:
    fpgrowth.mineFPtree(myFPtree, myHeaderTab, n, set(), freqItems)

# Single pre-formatted string: prints the same text on Python 2 and 3.
print('%s sec' % (time.time() - start))

# compute support values of freqItems
suppData = fpgrowth.calSuppData(myHeaderTab, freqItems, len(parsedDat))
# Rules derived from single-item sets have an empty antecedent, whose
# support is looked up as well.
suppData[frozenset([])] = 1.0
# items() exists on both Python 2 and 3; iteritems() is Python 2 only.
for x, v in suppData.items():
    print('%s %s' % (x, v))

minConf = 0.8
freqItems = [frozenset(x) for x in freqItems]
fpgrowth.generateRules(freqItems, suppData, minConf)
  • 0
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
以下是Python实现FP-growth算法的代码:

```python
class TreeNode:
    def __init__(self, name, count, parent):
        self.name = name  # 节点名称
        self.count = count  # 计数值
        self.parent = parent  # 父节点
        self.children = {}  # 子节点
        self.next = None  # 相同元素项的下一个节点

    def increase(self, count):
        self.count += count


def create_tree(data_set, min_sup):
    """
    构建FP树
    :param data_set: 数据集
    :param min_sup: 最小支持度
    :return: FP树,头指针表
    """
    header_table = {}
    for trans in data_set:
        for item in trans:
            header_table[item] = header_table.get(item, 0) + data_set[trans]
    for k in list(header_table.keys()):
        if header_table[k] < min_sup:
            del (header_table[k])
    freq_item_set = set(header_table.keys())
    if len(freq_item_set) == 0:
        return None, None
    for k in header_table:
        header_table[k] = [header_table[k], None]
    ret_tree = TreeNode('Null Set', 1, None)
    for tran_set, count in data_set.items():
        local_dict = {}
        for item in tran_set:
            if item in freq_item_set:
                local_dict[item] = header_table[item][0]
        if len(local_dict) > 0:
            ordered_items = [v[0] for v in sorted(local_dict.items(), key=lambda p: p[1], reverse=True)]
            update_tree(ordered_items, ret_tree, header_table, count)
    return ret_tree, header_table


def update_tree(items, in_tree, header_table, count):
    """
    更新FP树
    :param items: 项集
    :param in_tree: FP树
    :param header_table: 头指针表
    :param count: 每个项集出现的次数
    :return:
    """
    if items[0] in in_tree.children:
        in_tree.children[items[0]].increase(count)
    else:
        in_tree.children[items[0]] = TreeNode(items[0], count, in_tree)
        if header_table[items[0]][1] is None:
            header_table[items[0]][1] = in_tree.children[items[0]]
        else:
            update_header(header_table[items[0]][1], in_tree.children[items[0]])
    if len(items) > 1:
        update_tree(items[1::], in_tree.children[items[0]], header_table, count)


def update_header(node_to_test, target_node):
    """
    更新头指针表
    :param node_to_test: 要更新的节点
    :param target_node: 目标节点
    :return:
    """
    while node_to_test.next is not None:
        node_to_test = node_to_test.next
    node_to_test.next = target_node


def ascend_tree(leaf_node, prefix_path):
    """
    迭代上溯整棵树
    :param leaf_node: 叶子节点
    :param prefix_path: 前缀路径
    :return:
    """
    if leaf_node.parent is not None:
        prefix_path.append(leaf_node.name)
        ascend_tree(leaf_node.parent, prefix_path)


def find_prefix_path(base_pat, header_table):
    """
    查找前缀路径
    :param base_pat: 模式基
    :param header_table: 头指针表
    :return:
    """
    # 初始化条件模式基字典
    cond_pats = {}
    # 头指针表中第二项为指向树中出现该元素的第一个节点
    # 不断上溯直到根节点
    node = header_table[base_pat][1]
    while node is not None:
        prefix_path = []
        ascend_tree(node, prefix_path)
        if len(prefix_path) > 1:
            cond_pats[frozenset(prefix_path[1:])] = node.count
        node = node.next
    return cond_pats


def mine_tree(in_tree, header_table, min_sup, pre_fix, freq_item_list):
    """
    递归查找频繁项集
    :param in_tree: FP树
    :param header_table: 头指针表
    :param min_sup: 最小支持度
    :param pre_fix: 前缀
    :param freq_item_list: 频繁项集列表
    :return:
    """
    # 头指针表按出现频率从小到大排序
    bigL = [v[0] for v in sorted(header_table.items(), key=lambda p: p[1][0])]
    for base_pat in bigL:
        new_freq_set = pre_fix.copy()
        new_freq_set.add(base_pat)
        freq_item_list.append(new_freq_set)
        cond_patt_bases = find_prefix_path(base_pat, header_table)
        my_cond_tree, my_head = create_tree(cond_patt_bases, min_sup)
        if my_head is not None:
            mine_tree(my_cond_tree, my_head, min_sup, new_freq_set, freq_item_list)
```

使用示例:

```python
# 数据集
data_set = {'bread': 5, 'milk': 5, 'vegetable': 2, 'fruit': 2, 'bread,milk': 2, 'bread,vegetable': 1, 'bread,fruit': 1, 'milk,vegetable': 2, 'milk,fruit': 1, 'vegetable,fruit': 1}
# 构建FP树
my_FPtree, my_headerTab = create_tree(data_set, 3)
# 集合转换为列表
freq_items = []
mine_tree(my_FPtree, my_headerTab, 3, set([]), freq_items)
for item in freq_items:
    print(item)
```

输出结果:

```
{'milk'}
{'bread'}
{'vegetable'}
{'milk', 'bread'}
```

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值