class treeNode:
    """A single node of an FP-tree.

    Attributes:
        name: the item label stored at this node.
        count: the occurrence count accumulated at this node.
        nodeLink: reference to another node, initially None; updateHeader
            uses it to chain nodes together.
        parent: the node passed in as this node's parent (None for a root).
        children: dict of child nodes, initially empty.
    """

    def __init__(self, nameValue, numOccur, parentNode):
        """Create a node labelled `nameValue` with count `numOccur`."""
        self.name, self.count = nameValue, numOccur
        self.parent = parentNode
        self.nodeLink = None
        self.children = {}

    def inc(self, numOccur):
        """Add `numOccur` to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this subtree in pre-order, one node per line.

        Each line is prefixed with `ind` spaces for this node and one more
        space for every level below it.
        """
        # Explicit stack instead of recursion; children are pushed in
        # reverse so they are popped (and printed) in insertion order.
        pending = [(self, ind)]
        while pending:
            node, depth = pending.pop()
            print(' ' * depth, node.name, ' ', node.count)
            for child in reversed(list(node.children.values())):
                pending.append((child, depth + 1))
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table from counted transactions.

    Args:
        dataSet: dict mapping each transaction (an iterable of hashable
            items, e.g. a frozenset) to the number of times it occurs.
        minSup: minimum total count an item must reach to be kept.

    Returns:
        A tuple ``(retTree, headerTable)``. ``retTree`` is the root
        ``treeNode`` (named 'Null Set'). ``headerTable`` maps every kept
        item to a two-element list ``[total count, node]`` whose second slot
        starts as None and is filled in by ``updateTree``. Returns
        ``(None, None)`` when no item reaches ``minSup``.
    """
    # Pass 1: total support of every item across all transactions.
    support = {}
    for trans, count in dataSet.items():
        for item in trans:
            support[item] = support.get(item, 0) + count

    # Keep only items meeting the threshold; each entry is [count, None].
    headerTable = {
        item: [total, None]
        for item, total in support.items()
        if total >= minSup
    }
    if not headerTable:
        return None, None

    # One global order (descending support) shared by every transaction.
    # sorted() is stable, so items with equal support keep a single fixed
    # relative order; sorting each transaction independently could order
    # tied items differently and split a shared prefix into two branches.
    ranked = sorted(headerTable, key=lambda item: headerTable[item][0], reverse=True)
    rank = {item: position for position, item in enumerate(ranked)}

    retTree = treeNode('Null Set', 1, None)

    # Pass 2: insert each transaction's frequent items in global rank order.
    for tranSet, count in dataSet.items():
        orderedItems = sorted(
            {item for item in tranSet if item in rank},
            key=rank.__getitem__,
        )
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable
def updateTree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the tree below `inTree`.

    Args:
        items: the transaction's items, already in insertion order.
            An empty sequence is a no-op.
        inTree: the ``treeNode`` under which the path is inserted.
        headerTable: dict mapping item -> ``[count, node]``; when a new node
            is created for an item, it is stored in the second slot if that
            slot is None, otherwise appended via ``updateHeader``.
        count: amount added to every node along the path.
    """
    # Walk down iteratively: one loop step per item instead of one recursive
    # call plus an items[1:] copy per item.
    node = inTree
    for item in items:
        if item in node.children:
            # Path already exists: just accumulate the count.
            child = node.children[item]
            child.inc(count)
        else:
            # Grow a new branch and register the new node for this item.
            child = treeNode(item, count, node)
            node.children[item] = child
            entry = headerTable[item]
            if entry[1] is None:
                entry[1] = child
            else:
                updateHeader(entry[1], child)
        node = child
def updateHeader(nodeToTest, targetNode):
    """Append `targetNode` to the end of the `nodeLink` chain.

    Starting at `nodeToTest`, follows `nodeLink` references (iteratively)
    until a node whose `nodeLink` is None is found, then points that node's
    `nodeLink` at `targetNode`.
    """
    tail = nodeToTest
    while True:
        successor = tail.nodeLink
        if successor is None:
            break
        tail = successor
    tail.nodeLink = targetNode
def loadSimpDat():
    """Return a small sample data set: six transactions of one-letter items.

    A new list of lists is built on every call.
    """
    # Each string spells out one transaction, one character per item.
    rows = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtp',
        'yzxeqstm',
    )
    return [list(row) for row in rows]
def createInitSet(dataSet):
    """Convert a list of transactions into a ``{frozenset: count}`` dict.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        A dict mapping ``frozenset(transaction)`` to the number of
        transactions in `dataSet` that contain exactly that set of items.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Accumulate rather than assign 1, so repeated transactions are
        # counted once per occurrence instead of being collapsed.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
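# Source: code walkthrough of the "FP-growth" algorithm, chapter 12 of
# "Machine Learning in Action" (Li Rui's Chinese edition).
# Latest recommended article published 2023-01-11 02:04:56.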