# 本文主要记录本人在学习机器学习过程中的相关代码实现，参考《机器学习实战》。
from numpy import *
# Build a small sample data set for testing.
def loadSimpDat():
    """Return six hard-coded transactions, each a list of one-letter items.

    A fresh list of lists is built on every call, so callers may mutate it.
    """
    rows = (
        'rzhjp',
        'zyxwvuts',
        'z',
        'rxnos',
        'yrxzqtp',
        'yzxeqstm',
    )
    # Each string is exploded into its characters, one item per letter.
    return [list(row) for row in rows]
def createInitSet(dataSet):
    """Convert a list of transactions into the dict form createTree expects.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable items.

    Returns:
        dict mapping frozenset(transaction) -> number of times a transaction
        with exactly those items occurs in dataSet.
    """
    retDict = {}
    for trans in dataSet:
        key = frozenset(trans)
        # Accumulate instead of assigning 1, so repeated transactions keep
        # their multiplicity and item supports are not undercounted.
        retDict[key] = retDict.get(key, 0) + 1
    return retDict
# Node class for the FP-tree.
class treeNode:
    """A single FP-tree node holding an item name and its occurrence count."""

    def __init__(self, nameValue, numOccur, parentNode):
        self.children = {}  # child nodes of this node
        self.parent = parentNode  # parent of this node
        self.nodeLink = None  # link to the next node holding a similar item
        self.count = numOccur
        self.name = nameValue

    def inc(self, numOccur):
        """Add numOccur to this node's count."""
        self.count += numOccur

    def disp(self, ind=1):
        """Print this node and, recursively, its children.

        ind is the number of leading spaces; it grows by one per tree level.
        """
        print(' ' * ind, self.name, ' ', self.count)
        for key in self.children:
            self.children[key].disp(ind + 1)
#~ 测试
#~ rootNode=treeNode('pyramid',9,None)
#~ rootNode.children['eye']=treeNode('eye',13,None)
#~ rootNode.children['phoenix']=treeNode('phoenix',3,None)
#~ rootNode.disp()
# Build an FP-tree from a data set and a minimum support threshold.
def createTree(dataSet, minSup=1):
    """Build an FP-tree and its header table.

    Args:
        dataSet: dict mapping frozenset(transaction) -> occurrence count.
        minSup: minimum total count an item needs in order to be kept.

    Returns:
        (retTree, headerTable). headerTable maps each kept item to a
        two-element list [total count, None]; the second slot is filled in by
        updateTree with the first tree node created for that item.
        (None, None) is returned when no item reaches minSup.
    """
    # First pass: total count of every item over all transactions.
    headerTable = {}
    for trans, count in dataSet.items():
        for item in trans:
            headerTable[item] = headerTable.get(item, 0) + count

    # Keep only the items that reach the threshold.
    headerTable = {item: total for item, total in headerTable.items()
                   if total >= minSup}
    if not headerTable:
        return None, None

    # One global ordering shared by every transaction: descending count, ties
    # broken by repr(item). Sorting each transaction only by count left ties
    # to the iteration order of that particular frozenset, so the same items
    # could be inserted in different orders for different transactions, which
    # splits their joint support across branches.
    ranked = sorted(headerTable,
                    key=lambda item: (-headerTable[item], repr(item)))
    rank = {item: position for position, item in enumerate(ranked)}

    for item in headerTable:
        # The None slot later points at the first node holding this item.
        headerTable[item] = [headerTable[item], None]

    retTree = treeNode('Null Set', 1, None)
    # Second pass: insert the frequent items of each transaction in rank order.
    for tranSet, count in dataSet.items():
        orderedItems = sorted((item for item in tranSet if item in rank),
                              key=rank.__getitem__)
        if orderedItems:
            updateTree(orderedItems, retTree, headerTable, count)
    return retTree, headerTable
# Tree growth function.
def updateTree(items, inTree, headerTable, count):
    """Insert one ordered transaction into the tree below inTree.

    Args:
        items: the transaction's items, already ordered; may be empty, in
            which case nothing is changed.
        inTree: node under which the path for items starts.
        headerTable: maps item -> [count, first node for that item]; the
            entry is read and updated only when a new node is created.
        count: amount added to every node along the path.

    The path is walked with a loop rather than one recursive call (and one
    list copy) per item, so long transactions neither hit the recursion limit
    nor cost quadratic copying.
    """
    node = inTree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            child.inc(count)
        else:
            child = treeNode(item, count, node)
            node.children[item] = child
            entry = headerTable[item]
            if entry[1] is None:
                # First node for this item: it becomes the head of the chain.
                entry[1] = child
            else:
                # Otherwise append it to the end of the item's node chain.
                updateHeader(entry[1], child)
        node = child
# Update the header table's node chain.
def updateHeader(nodeToTest, targetNode):
    """Append targetNode to the end of the nodeLink chain starting at nodeToTest.

    The chain is followed with a loop, not recursion.
    """
    tail = nodeToTest
    while True:
        following = tail.nodeLink
        if following is None:
            break
        tail = following
    tail.nodeLink = targetNode
#~ 测试
#~ simpDat=loadSimpDat()
#~ print(simpDat)
#~ initSet=createInitSet(simpDat)
#~ print(initSet)
#~ myFPtree,myHeaderTab=createTree(initSet,3)
#~ myFPtree.disp()
# Functions that find all paths ending with a given item.
def ascendTree(leafNode, prefixPath):
    """Append the names on the path from leafNode up to the root.

    Args:
        leafNode: node to start from; its own name is appended first.
        prefixPath: list extended in place. The node whose parent is None
            (the root) is not appended.

    The climb uses a loop instead of one recursive call per ancestor, so a
    deep branch cannot exceed the interpreter's recursion limit.
    """
    node = leafNode
    while node.parent is not None:
        prefixPath.append(node.name)
        node = node.parent
def findPrefixPath(basePat, treeNode):
    """Collect the prefix paths of every node on one item's node chain.

    Args:
        basePat: the item the chain belongs to; not used by the computation.
        treeNode: first node of the nodeLink chain, or None for an empty
            chain. Inside this function the parameter shadows the treeNode
            class.

    Returns:
        dict mapping frozenset(names of a node's ancestors, root excluded)
        -> summed count of the chain nodes that have exactly that ancestor
        set. Nodes sitting directly under the root contribute nothing.
    """
    condPats = {}
    linked = treeNode
    while linked is not None:
        # Walk the parents iteratively, starting above the node itself, so
        # deep branches cannot hit the recursion limit and no slice is needed
        # to drop the node's own name.
        ancestors = []
        up = linked.parent
        while up is not None and up.parent is not None:
            ancestors.append(up.name)
            up = up.parent
        if ancestors:
            key = frozenset(ancestors)
            # Sum rather than overwrite: two chain nodes can share the same
            # ancestor set when their branches list those items in different
            # orders, and both counts belong to this pattern.
            condPats[key] = condPats.get(key, 0) + linked.count
        linked = linked.nodeLink
    return condPats
#~ simpDat=loadSimpDat()
#~ initSet=createInitSet(simpDat)
#~ myFPtree,myHeaderTab=createTree(initSet,3)
#~ myFPtree.disp()
#~ condPats=findPrefixPath('r',myHeaderTab['r'][1])
#~ print(condPats)
# Recursively mine frequent item sets from an FP-tree.
def mineTree(inTree, headerTable, minSup, preFix, freqItemList):
    """Append every frequent item set reachable from headerTable to freqItemList.

    Args:
        inTree: the tree headerTable belongs to; not read by this function.
        headerTable: maps item -> [count, first node for that item].
        minSup: threshold passed on to createTree for each conditional tree.
        preFix: set of items already fixed on this branch of the recursion;
            it is copied, never modified.
        freqItemList: list extended in place with the item sets found.
    """
    # Visit the items from the least to the most frequent.
    ascending = sorted(headerTable, key=lambda item: headerTable[item][0])
    for basePat in ascending:
        newFreqSet = preFix.copy()
        newFreqSet.add(basePat)
        freqItemList.append(newFreqSet)
        # Build the conditional FP-tree from this item's prefix paths.
        myCondTree, myHead = createTree(
            findPrefixPath(basePat, headerTable[basePat][1]), minSup)
        if myHead is None:
            # Nothing frequent remains below this item set.
            continue
        mineTree(myCondTree, myHead, minSup, newFreqSet, freqItemList)
#~ simpDat=loadSimpDat()
#~ initSet=createInitSet(simpDat)
#~ myFPtree,myHeaderTab=createTree(initSet,3)
#~ freqItems=[]
#~ mineTree(myFPtree,myHeaderTab,3,set([]),freqItems)
# Mine kosarak.dat: every line is one transaction of whitespace-separated items.
# The with-block closes the file as soon as it has been read.
with open('kosarak.dat') as dataFile:
    parsedDat = [line.split() for line in dataFile]
initSet = createInitSet(parsedDat)
myFPtree, myHeaderTab = createTree(initSet, 100000)
myFreqList = []
# createTree returns (None, None) when no item reaches the threshold; in that
# case there is nothing to mine and the result stays empty.
if myHeaderTab is not None:
    mineTree(myFPtree, myHeaderTab, 100000, set(), myFreqList)
print(len(myFreqList))
print(myFreqList)