自己的一点点领悟,可能会有点小错误,欢迎交流^_^
获得频繁项集
主要思想
python代码
def loadDataSet():
return [[1,3,4],[2,3,5],[1,2,3,5],[2,5]]
createC1(dataSet)获得所有第一层的所有项集
def createC1(dataSet):
C1 = []
for transaction in dataSet:
for item in transaction:
if not [item] in C1:
C1.append([item])
C1.sort()
return map(frozenset,C1)
#scanD是根据训练数据D,来判断Ck里面一堆的项集是否是频繁的。
def scanD(D,Ck,minSupport):
ssCnt = {}
for tid in D:
for can in Ck:
if can.issubset(tid):
if not ssCnt.has_key(can): ssCnt[can] = 1
else: ssCnt[can] += 1
numItems = float(len(D))
retList = []
supportData = {}
for key in ssCnt:
support = ssCnt[key] / numItems
if support >= minSupport:
retList.insert(0,key)
supportData[key] = support