就是频繁项集挖掘,FP-Growth算法。
先产生headerTable:
数据结构(其实也是调了好几次代码才确定的,因为一开始总有想不到的东西):headerTable 的每个 entry 为 {authorName: [frequence, firstChildPointer, startYear, endYear]}
def _is_missing(value):
    """Return True when *value* is a NaN placeholder for a missing field."""
    # NaN is the only value that compares unequal to itself, so this catches
    # every NaN object (np.nan, float("nan"), numpy float scalars), not just
    # the np.nan singleton that an identity test would match.
    return bool(value != value)


def CreateHeaderTable(tranDB, minSupport=1):
    """Build the FP-Growth header table and the de-duplicated transaction DB.

    Args:
        tranDB: iterable of ``(conf, year, authorList)`` rows, where
            ``authorList`` is a ``"|"``-separated string of author names.
            Rows in which any of the three fields is NaN are skipped.
        minSupport: authors whose total frequency is below this value are
            removed from the returned header table.

    Returns:
        A ``(headerTable, authorDB)`` tuple:

        * ``headerTable`` maps ``authorName`` to the list
          ``[frequence, firstChildPointer, startYear, endYear]``;
          ``firstChildPointer`` is initialised to ``None`` here.
        * ``authorDB`` maps ``frozenset(authors of one transaction)`` to the
          number of transactions with exactly that author set. It is NOT
          filtered by ``minSupport``.
    """
    headerTable = {}  # {authorName: [frequence, firstChildPointer, startYear, endYear]}
    authorDB = {}  # {frozenset(authorListSet): frequence}
    for i, (conf, year, authorList) in enumerate(tranDB):
        # Progress trace; formatted so the output is the same on Python 2 and 3.
        print("trans %s %s" % (i, "==" * 20))
        if _is_missing(conf) or _is_missing(year) or _is_missing(authorList):
            continue  # incomplete row (e.g. tranDB[2426, :])

        authors = authorList.split("|")
        for author in authors:
            entry = headerTable.get(author)
            if entry is None:
                # First sighting: the active period starts and ends this year.
                headerTable[author] = [1, None, year, year]
                continue
            entry[0] += 1
            # Widen the [startYear, endYear] range to include this year.
            if year < entry[2]:
                entry[2] = year
            elif year > entry[3]:
                entry[3] = year

        # Count the transaction once, keyed by its (order-free) author set.
        authorListSet = frozenset(authors)
        authorDB[authorListSet] = authorDB.get(authorListSet, 0) + 1

    # Iterate over a snapshot of the keys: deleting from a dict while
    # iterating over it directly raises RuntimeError on Python 3.
    for author in list(headerTable):
        if headerTable[author][0] < minSupport:
            del headerTable[author]
    return headerTable, authorDB
再构建FP-Tree:
每个treeNode由五元组来描述:
class TREE_NODE:
def __init__(self, authorName, frequence, parentPointer):
self.authorName=authorName
self.frequence=frequence
s