网上的 python3 fp-growth代码每次在执行时可能会出现找出的频繁项集不一致的情况,这是因为每次执行代码时建的FP树可能不一致。
加了一行代码可以解决这个问题(第59行):先对 frequentItemsInRecord 按 key 的 ASCII 码排序,然后再按照 key 的支持度(即 value 值)降序排列。
之所以这么做是因为 frequentItemsInRecord 中可能会出现支持度一样的项,如果不先按 ASCII 码排一次的话,
有可能出现每次执行代码时 orderedFrequentItems (第60行)中相同支持度的项出现的顺序不一致,从而造成每次建的FP树不一致,导致找出的频繁项集不一致。
import pprint
def loadDataSet():
    """Return the built-in sample transactions.

    Returns:
        A new list of six transactions, each a list of item-name strings.
    """
    # One whitespace-separated string per transaction keeps the table compact;
    # splitting yields exactly the item lists used by the demo.
    baskets = (
        'bread milk vegetable fruit eggs',
        'noodle beef pork water socks gloves shoes rice',
        'socks gloves',
        'bread milk shoes socks eggs',
        'socks shoes sweater cap milk vegetable gloves',
        'eggs bread milk fish crab shrimp rice',
    )
    return [basket.split() for basket in baskets]
def transfer2FrozenDataSet(dataSet):
    """Convert a list of transactions into a ``{frozenset: count}`` mapping.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        A dict whose keys are the transactions as ``frozenset`` objects and
        whose values are how many times that exact item set occurs in
        ``dataSet``.
    """
    frozenDataSet = {}
    for elem in dataSet:
        transaction = frozenset(elem)
        # Accumulate instead of overwriting with 1, so that repeated
        # transactions contribute their full weight to item supports.
        frozenDataSet[transaction] = frozenDataSet.get(transaction, 0) + 1
    return frozenDataSet
class TreeNode:
    """A single node of the FP-tree.

    Attributes:
        nodeName: label given to the node by its creator.
        count: running counter, raised through ``increaseC``.
        nodeParent: the node this one hangs under (``None`` for the root
            created in ``createFPTree``).
        nextSimilarItem: link to another node, initially ``None``; chained
            by ``updateHeadPointTable``.
        children: dict of child nodes, initially empty.
    """

    def __init__(self, nodeName, count, nodeParent):
        # Caller-supplied identity and position in the tree.
        self.nodeParent = nodeParent
        self.nodeName = nodeName
        self.count = count
        # Links that are filled in later while the tree is being built.
        self.children = {}
        self.nextSimilarItem = None

    def increaseC(self, count):
        """Raise this node's counter by ``count``."""
        self.count += count
def createFPTree(frozenDataSet, minSupport):
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        frozenDataSet: dict mapping each transaction (a ``frozenset`` of
            items) to its occurrence count.
        minSupport: minimum summed count an item needs in order to be kept.

    Returns:
        ``(fptree, headPointTable)``. ``fptree`` is the root ``TreeNode``;
        ``headPointTable`` maps every kept item to a two-element list
        ``[support, node]`` whose second slot is filled by ``updateFPTree``.
        Returns ``(None, None)`` when no item reaches ``minSupport``.
    """
    # First scan: sum up the support of every item.
    supportByItem = {}
    for transaction, weight in frozenDataSet.items():
        for item in transaction:
            supportByItem[item] = supportByItem.get(item, 0) + weight

    # Keep only the frequent items, each with an empty node-link slot.
    headPointTable = {}
    for item, support in supportByItem.items():
        if support >= minSupport:
            headPointTable[item] = [support, None]
    if not headPointTable:
        return None, None

    fptree = TreeNode("null", 1, None)
    # Second scan: insert the frequent part of every transaction.
    for transaction, weight in frozenDataSet.items():
        # Sort by item first so that the stable support sort below breaks
        # ties the same way on every run.
        orderedFrequentItems = sorted(
            item for item in transaction if item in headPointTable
        )
        if not orderedFrequentItems:
            continue
        orderedFrequentItems.sort(
            key=lambda item: headPointTable[item][0], reverse=True
        )
        updateFPTree(fptree, orderedFrequentItems, headPointTable, weight)
    return fptree, headPointTable
def updateFPTree(fptree, orderedFrequentItems, headPointTable, count):
    """Insert one ordered transaction into the tree below ``fptree``.

    Walks down from ``fptree`` following ``orderedFrequentItems``. A child
    that already exists has its counter raised by ``count``; a missing child
    is created with that count and registered in ``headPointTable``.

    The walk is iterative, so the length of a transaction is not limited by
    the interpreter's recursion depth and the item list is not re-sliced at
    every level.

    Args:
        fptree: node under which the path is inserted.
        orderedFrequentItems: non-empty sequence of items, already ordered.
        headPointTable: dict mapping item -> ``[support, node]``; slot 1 is
            set to the first node created for the item, later nodes are
            appended to that node's ``nextSimilarItem`` chain.
        count: weight to add along the path.

    Raises:
        IndexError: if ``orderedFrequentItems`` is empty.
    """
    parent = fptree
    position = 0
    while True:
        item = orderedFrequentItems[position]
        if item in parent.children:
            child = parent.children[item]
            child.increaseC(count)
        else:
            child = TreeNode(item, count, parent)
            parent.children[item] = child
            # Only newly created nodes are linked into the header table;
            # linking an existing node again would make the chain cyclic.
            headEntry = headPointTable[item]
            if headEntry[1] is None:
                headEntry[1] = child
            else:
                updateHeadPointTable(headEntry[1], child)
        position += 1
        if position >= len(orderedFrequentItems):
            break
        # Continue with the remaining items one level deeper.
        parent = child
def updateHeadPointTable(headPointBeginNode, targetNode):
    """Append ``targetNode`` to the end of a ``nextSimilarItem`` chain.

    Args:
        headPointBeginNode: first node of the chain.
        targetNode: node to hang after the current last node.
    """
    tail = headPointBeginNode
    successor = tail.nextSimilarItem
    # Follow the links until the node that has no successor yet.
    while successor is not None:
        tail, successor = successor, successor.nextSimilarItem
    tail.nextSimilarItem = targetNode
def mineFPTree(headPointTable, prefix, frequentPatterns, minSupport):
    """Recursively mine frequent patterns from a header table.

    For every item in ``headPointTable`` (lowest support first) the pattern
    ``prefix + {item}`` is recorded with the item's support, a conditional
    FP-tree is built from the item's prefix paths, and that tree is mined
    in turn with the extended prefix.

    Args:
        headPointTable: dict item -> ``[support, node]`` as produced by
            ``createFPTree``. ``None`` or an empty dict means there is
            nothing to mine and the call returns immediately.
        prefix: set of items already fixed for this branch of the search.
        frequentPatterns: output dict, filled in place with
            ``frozenset(pattern) -> support``.
        minSupport: minimum support passed on to ``createFPTree``.
    """
    # createFPTree hands back None when no item is frequent; treat that the
    # same as an empty table instead of failing on attribute access.
    if not headPointTable:
        return
    headPointItems = sorted(
        headPointTable, key=lambda item: headPointTable[item][0]
    )
    for headPointItem in headPointItems:
        newPrefix = prefix | {headPointItem}
        frequentPatterns[frozenset(newPrefix)] = headPointTable[headPointItem][0]
        prefixPath = getPrefixPath(headPointTable, headPointItem)
        if not prefixPath:
            continue
        # Only the header table of the conditional tree is needed here.
        _, conditionalHeadPointTable = createFPTree(prefixPath, minSupport)
        if conditionalHeadPointTable is not None:
            mineFPTree(conditionalHeadPointTable, newPrefix, frequentPatterns, minSupport)
def getPrefixPath(headPointTable, headPointItem):
    """Collect the prefix paths of every node linked for ``headPointItem``.

    Args:
        headPointTable: dict item -> ``[support, node]``.
        headPointItem: item whose node chain is followed.

    Returns:
        Dict mapping ``frozenset`` of the names returned by ``ascendTree``
        for a node to that node's ``count``. Nodes for which ``ascendTree``
        returns an empty list contribute nothing.
    """
    conditionalBase = {}
    node = headPointTable[headPointItem][1]
    while True:
        ancestors = ascendTree(node)
        if ancestors:
            conditionalBase[frozenset(ancestors)] = node.count
        # Move along the chain of nodes that carry the same item.
        node = node.nextSimilarItem
        if node is None:
            break
    return conditionalBase
def ascendTree(treeNode):
    """Return the names of the ancestors of ``treeNode``, nearest first.

    The starting node itself and the root are not included. The root is
    recognised as the node without a parent rather than by its name, so an
    item that happens to be called ``'null'`` is handled like any other.

    Args:
        treeNode: node to start from; must expose ``nodeParent`` and the
            ancestors must expose ``nodeName``.

    Returns:
        List of ancestor names, ordered from the direct parent upwards;
        empty when ``treeNode`` is the root or hangs directly under it.
    """
    prefixs = []
    ancestor = treeNode.nodeParent
    # Stop before the parentless root so its placeholder name is left out.
    while ancestor is not None and ancestor.nodeParent is not None:
        prefixs.append(ancestor.nodeName)
        ancestor = ancestor.nodeParent
    return prefixs
def rulesGenerator(frequentPatterns, minConf, rules):
    """Derive association rules from every multi-item frequent pattern.

    Args:
        frequentPatterns: dict mapping ``frozenset`` patterns to support.
        minConf: minimum confidence handed to ``getRules``.
        rules: output list, extended in place by ``getRules``.
    """
    for candidate in frequentPatterns:
        # A single item cannot be split into antecedent and consequent.
        if len(candidate) <= 1:
            continue
        getRules(candidate, candidate, rules, frequentPatterns, minConf)
def removeStr(set, str):
    """Return a ``frozenset`` of the elements of ``set`` that differ from ``str``.

    Args:
        set: iterable of hashable elements.
        str: the element to leave out.
    """
    # The parameter names are part of the signature and are kept as they are.
    return frozenset(elem for elem in set if elem != str)
def getRules(frequentset, currentset, rules, frequentPatterns, minConf):
    """Append the rules ``antecedent -> frequentset - antecedent`` that pass ``minConf``.

    Each antecedent is ``currentset`` with one element removed. Its
    confidence is ``support(frequentset) / support(antecedent)``, both looked
    up in ``frequentPatterns``. An antecedent that passes and still has at
    least two elements is shrunk further by a recursive call.

    Args:
        frequentset: the full pattern the rules are derived from.
        currentset: the candidate set whose elements are dropped one by one.
        rules: output list of ``(antecedent, consequent, confidence)``
            tuples, extended in place; an antecedent/consequent pair already
            present is not added twice.
        frequentPatterns: dict mapping ``frozenset`` patterns to support.
        minConf: minimum confidence a rule needs to be kept.
    """
    fullSupport = frequentPatterns[frequentset]
    for dropped in currentset:
        antecedent = removeStr(currentset, dropped)
        confidence = fullSupport / frequentPatterns[antecedent]
        if confidence >= minConf:
            consequent = frequentset - antecedent
            # The same antecedent can be reached along several removal orders.
            alreadyListed = any(
                rule[0] == antecedent and rule[1] == consequent
                for rule in rules
            )
            if not alreadyListed:
                rules.append((antecedent, consequent, confidence))
            if len(antecedent) >= 2:
                getRules(frequentset, antecedent, rules, frequentPatterns, minConf)
if __name__ == '__main__':
    # Thresholds used by the demo run.
    minSupport = 3
    minConf = 0.6

    # Load the sample transactions and build the FP-tree from them.
    dataSet = loadDataSet()
    frozenDataSet = transfer2FrozenDataSet(dataSet)
    fptree, headPointTable = createFPTree(frozenDataSet, minSupport)

    # Mine the frequent patterns, starting from an empty prefix.
    frequentPatterns = {}
    mineFPTree(headPointTable, set(), frequentPatterns, minSupport)
    print("frequent patterns:")
    pprint.pprint(frequentPatterns)

    # Turn the patterns into association rules and report them.
    rules = []
    rulesGenerator(frequentPatterns, minConf, rules)
    print("association rules:")
    pprint.pprint(rules)
    print('rules num:', len(rules))