#coding=utf-8
def loadDataSet():
    """Return a small sample transaction database for trying out Apriori.

    Returns:
        A freshly built list of four transactions; each transaction is a
        list of integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size 1.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one single-item frozenset per distinct item, ordered by
        item. frozenset (immutable, hashable) is used so the itemsets can
        serve as dictionary keys later on.
    """
    items = set()
    for transaction in dataSet:
        items.update(transaction)
    # Return a real list rather than map(): on Python 3 map() yields a
    # one-shot iterator, which a caller looping over the candidates once per
    # transaction would exhaust after the first transaction.
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, Ck, minSupport):
    """Scan the transactions and keep the candidates that are frequent.

    Args:
        D: iterable of transactions, each a set of items.
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple (retList, supportData). retList holds the candidates whose
        support is at least minSupport. supportData maps every candidate
        found in at least one transaction to its support, including the
        candidates that fell below minSupport.
    """
    # Materialise both inputs: D is measured with len() and Ck is walked once
    # per transaction, neither of which works on a one-shot iterator.
    D = list(D)
    Ck = list(Ck)

    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # float() keeps the division below a true division on Python 2 as well.
    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Frequent itemsets are reported in reverse order of the counting dict.
    retList.reverse()
    return retList, supportData
def aprioriGen(Lk, k):
    """Generate the candidate itemsets Ck of size k from itemsets of size k-1.

    Two itemsets are joined when their k-2 smallest items are identical, so
    each k-item candidate is produced at most once.

    Args:
        Lk: iterable of frozensets, each holding k-1 mutually comparable
            items.
        k: number of items in each generated candidate.

    Returns:
        A list of frozensets, the union of every joinable pair, in pair
        order.
    """
    from itertools import combinations

    # Sort BEFORE slicing: the iteration order of a set is arbitrary, so
    # slicing first could compare different items of two sets that share the
    # same prefix and silently miss candidates. Computed once per itemset.
    keyed = [(itemset, sorted(itemset)[:k - 2]) for itemset in Lk]
    return [
        left | right
        for (left, leftPrefix), (right, rightPrefix) in combinations(keyed, 2)
        if leftPrefix == rightPrefix
    ]
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a transaction database with Apriori.

    Args:
        dataSet: iterable of transactions, each an iterable of items.
        minSupport: minimum fraction of transactions an itemset must appear
            in to be considered frequent.

    Returns:
        A tuple (L, supportData). L[i] is the list of frequent itemsets with
        i + 1 items; the last entry of L is always an empty list, because
        the search stops at the first level that yields nothing.
        supportData maps the counted itemsets to their support.
    """
    # dataSet is traversed twice (candidates, then transactions) and the
    # transactions are rescanned for every level, so everything is held in
    # real lists instead of one-shot iterators such as Python 3's map().
    dataSet = list(dataSet)
    C1 = list(createC1(dataSet))
    D = [set(transaction) for transaction in dataSet]

    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]
    k = 2
    # L[k - 2] is the most recent level; grow until a level comes back empty.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1
    return L, supportData
def generateRules(L, supportData, minConf=0.7):
    """Derive association rules from the frequent itemsets.

    Args:
        L: list of frequent-itemset lists; L[i] holds itemsets of i + 1
            items.
        supportData: dict mapping itemsets (frozensets) to their support.
        minConf: minimum confidence a rule must reach to be reported.

    Returns:
        A list of (antecedent, consequent, confidence) tuples.
    """
    bigRuleList = []
    for level, itemsets in enumerate(L):
        if level == 0:
            # Single-item sets cannot be split into antecedent and consequent.
            continue
        for freqSet in itemsets:
            H1 = [frozenset([item]) for item in freqSet]
            # Always evaluate the single-item consequents first. Skipping
            # this step for larger itemsets would drop every rule of the
            # form {a, b} --> {c}.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Larger consequents can only pass if all their single-item
            # parts passed, and at least two survivors are needed to merge.
            if level > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate candidate rules and keep those reaching the minimum confidence.

    For every consequent in H the rule (freqSet - consequent) --> consequent
    is scored as support(freqSet) / support(freqSet - consequent).

    Args:
        freqSet: the frequent itemset (frozenset) the rules are built from.
        H: iterable of candidate consequents (frozensets).
        supportData: dict mapping itemsets to their support; it must contain
            freqSet and every antecedent.
        br1: list that accepted rules are appended to, as
            (antecedent, consequent, confidence) tuples.
        minConf: minimum confidence a rule must reach.

    Returns:
        The list of consequents whose rule met minConf.

    Raises:
        KeyError: if freqSet or an antecedent is missing from supportData.
    """
    prunedH = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # One pre-formatted string prints identically on Python 2 and 3.
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Grow rule consequents by one item at a time and keep the valid rules.

    The consequents passed in H are not scored here; only the larger
    consequents built by merging them are.

    Args:
        freqSet: the frequent itemset (frozenset) the rules are built from.
        H: list of consequents (frozensets), all of the same size.
        supportData: dict mapping itemsets to their support.
        br1: list that accepted rules are appended to.
        minConf: minimum confidence a rule must reach.

    Returns:
        None; accepted rules are appended to br1.
    """
    if not H:
        # Nothing to merge, and H[0] below would raise IndexError.
        return
    m = len(H[0])  # number of items in each current consequent
    # A consequent of m + 1 items must leave at least one antecedent item.
    if len(freqSet) > m + 1:
        hmp1 = aprioriGen(H, m + 1)
        # If a rule misses the confidence threshold, every rule whose
        # consequent contains that consequent misses it too: when
        # 0,1,2 --> 3 is a low-confidence rule, so is any rule with 3 in its
        # consequent. Only the surviving consequents are grown further.
        hmp1 = calcConf(freqSet, hmp1, supportData, br1, minConf)
        if len(hmp1) > 1:
            rulesFromConseq(freqSet, hmp1, supportData, br1, minConf)
总结
- 发现元素项间的不同组合是一项十分耗时的任务,不可避免地需要大量昂贵的计算资源,因此需要一些更智能的方法,在合理的时间范围内找到频繁项集。能够实现这一目标的一个方法是Apriori算法,它使用Apriori原理来减少在数据库上需要检查的集合的数目。Apriori原理是说:如果一个项集是非频繁的,那么包含该项集的所有超集也是非频繁的。
- 每次增加频繁项集的大小,Apriori算法都会重新扫描整个数据集。当数据集很大时,这会显著降低频繁项集的发现速度。