实验时发现,算法的主要耗时来自 C1 -> L1 的构建,因此对 L1 的生成过程进行了优化,在此简单记录。
def loadDataSet(filename):
    """Load transactions whose items are listed inside braces on each line.

    A data line is expected to look like ``1,"{citrus fruit,margarine}"``:
    the text between the first ``{`` and the last ``}`` is split on commas.
    Lines without an opening brace (for example a CSV header or a blank
    line) are skipped instead of raising ``IndexError``.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list with one list of item strings per parsed line.
    """
    dataSet = []
    # The context manager closes the file even if parsing raises.
    with open(filename, 'r') as f:
        for line in f:
            _, opening, rest = line.partition('{')
            if not opening:
                # No item list on this line (header / blank line).
                continue
            body, closing, _ = rest.rpartition('}')
            if not closing:
                # Unterminated item list: keep everything up to the line end.
                body = rest.rstrip('\r\n')
            dataSet.append(body.split(','))
    return dataSet
def createL1(dataSet, minsupport=0.5):
    """Build the frequent 1-itemsets with a single counting pass.

    Instead of generating every candidate 1-itemset and testing it against
    every transaction, the items are counted directly in one dictionary.

    Args:
        dataSet: Iterable-once-safe sequence of transactions; it must
            support ``len()`` and each transaction is an iterable of
            hashable items.
        minsupport: Minimum fraction of transactions an item must occur in
            to be considered frequent.

    Returns:
        A tuple ``(L1, supportData)`` where ``L1`` is the list of frequent
        items, each wrapped in a ``frozenset``, and ``supportData`` maps
        the ``frozenset`` of every seen item to its support.
    """
    counts = {}
    for transaction in dataSet:
        # Count an item at most once per transaction so that support is a
        # fraction of transactions, consistent with the set-based counting
        # used for larger itemsets. dict.fromkeys de-duplicates while
        # keeping first-seen order, so the result order stays deterministic.
        for item in dict.fromkeys(transaction):
            counts[item] = counts.get(item, 0) + 1

    total = float(len(dataSet))
    print('样本数量为' + str(total))
    threshold = total * minsupport

    supportData = {}
    frequent = []
    for item, count in counts.items():
        # counts is empty whenever total is 0, so this never divides by zero.
        supportData[frozenset([item])] = count / total
        if count >= threshold:
            frequent.append(frozenset([item]))
    return frequent, supportData
def scanD(D, Ck, minsupport):
    """Count candidate itemsets over the transactions and keep frequent ones.

    Args:
        D: Sequence of transactions (sets); must support ``len()``.
        Ck: Iterable of candidate itemsets (``frozenset`` objects). It is
            traversed once per transaction, so it must be re-iterable.
        minsupport: Minimum fraction of transactions a candidate must be a
            subset of to be kept.

    Returns:
        A tuple ``(retList, supportData)``. ``retList`` holds the frequent
        candidates, most recently first-seen candidate first.
        ``supportData`` maps every candidate that occurred in at least one
        transaction to its support.
    """
    ssCnt = {}
    for tid in D:
        for can in Ck:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1
    # NOTE(review): the original indentation was lost; this progress message
    # is assumed to be printed once per call, after the counting pass.
    print('统计中')

    numItems = float(len(D))
    retList = []
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minsupport:
            retList.append(key)
        supportData[key] = support
    # One reverse yields the same order as repeated insert(0, key) without
    # the quadratic cost of shifting the list on every insertion.
    retList.reverse()
    return retList, supportData
def aprioriGen(Lk, k):
    """Generate candidate k-itemsets by joining (k-1)-itemsets.

    Two itemsets are merged when the first ``k - 2`` items of their sorted
    forms are equal, which produces each candidate at most once.

    Args:
        Lk: List of ``frozenset`` itemsets, all of size ``k - 1``. Items
            within one itemset must be mutually orderable.
        k: Size of the itemsets to generate.

    Returns:
        A list with the union of every joinable pair, in pair order.
    """
    prefix_len = k - 2
    # Sort BEFORE slicing: set iteration order is arbitrary, so only the
    # sorted order gives a canonical prefix that can be compared between
    # itemsets. Computing it once per itemset also avoids re-sorting inside
    # the pairwise loop.
    prefixes = [sorted(itemset)[:prefix_len] for itemset in Lk]

    retList = []
    size = len(Lk)
    for i, left in enumerate(Lk):
        for j in range(i + 1, size):
            if prefixes[i] == prefixes[j]:
                retList.append(left | Lk[j])
    return retList
def apriori(dataSet, minSupport):
    """Mine all frequent itemsets level by level.

    The first level comes from createL1 (direct counting); every further
    level is produced by generating candidates from the previous level
    with aprioriGen and filtering them with scanD.

    Args:
        dataSet: Sequence of transactions, each an iterable of items.
        minSupport: Minimum support, as a fraction of the transactions.

    Returns:
        A tuple ``(L, supportData)``: ``L[i]`` is the list of frequent
        itemsets found at level ``i`` (the last entry is the empty level
        that stopped the search) and ``supportData`` maps itemsets to
        their support.
    """
    transactions = [set(record) for record in dataSet]
    first_level, supportData = createL1(dataSet, minSupport)
    print('L1,SupportData1 created')

    L = [first_level]
    size = 2
    # The most recent level is always the last entry; stop once it is empty.
    while L[-1]:
        candidates = aprioriGen(L[-1], size)
        frequent, level_support = scanD(transactions, candidates, minSupport)
        supportData.update(level_support)
        L.append(frequent)
        size += 1
    return L, supportData
def generateRules(L, supportData, minConf=0.7):
    """Collect association rules from the frequent itemsets.

    Args:
        L: List of levels of frequent itemsets; ``L[0]`` is skipped, so
            only entries from index 1 onwards are used.
        supportData: Mapping from itemset to support.
        minConf: Minimum confidence passed on to the rule helpers.

    Returns:
        The list filled in by calcConf / rulesFromConseq, holding
        ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for level in range(1, len(L)):
        for freqSet in L[level]:
            singles = [frozenset([member]) for member in freqSet]
            if level == 1:
                # Index 1: the single-item consequents are scored directly.
                calcConf(freqSet, singles, supportData, bigRuleList, minConf)
            else:
                # Later levels: let rulesFromConseq grow the consequents.
                rulesFromConseq(freqSet, singles, supportData,
                                bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, brl, minconf=0.7):
    """Score candidate consequents of one itemset and keep the confident ones.

    For each consequent the confidence is
    ``support(freqSet) / support(freqSet - consequent)``.

    Args:
        freqSet: The itemset the rules are derived from.
        H: Iterable of candidate consequents (``frozenset`` objects).
        supportData: Mapping from itemset to support.
        brl: List that receives an ``(antecedent, consequent, confidence)``
            tuple for every rule that passes; it is modified in place.
        minconf: Minimum confidence a rule needs to be kept.

    Returns:
        The consequents whose rule reached ``minconf``, in input order.
    """
    kept = []
    for conseq in H:
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minconf:
            message = ''.join(
                (str(antecedent), '--->', str(conseq), ' 可信度为:', str(conf))
            )
            print(message)
            brl.append((antecedent, conseq, conf))
            kept.append(conseq)
    return kept
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
    """Generate rules for one itemset with progressively larger consequents.

    Starting from the consequents in ``H``, each round scores the current
    consequents with calcConf, then merges the ones that passed into
    consequents one item larger with aprioriGen. Every consequent size
    smaller than ``len(freqSet)`` is scored, including the largest one, and
    a single merged candidate is still scored before the search stops.

    Args:
        freqSet: The frequent itemset the rules are derived from.
        H: List of candidate consequents (``frozenset`` objects), all of
            the same size. An empty list is a no-op.
        supportData: Mapping from itemset to support.
        brl: List that receives the accepted rules; modified in place.
        minConf: Minimum confidence a rule needs to be kept.

    Returns:
        None. Results are appended to ``brl``.
    """
    if not H:
        return
    m = len(H[0])
    # A consequent must leave a non-empty antecedent, so only sizes strictly
    # below len(freqSet) are scored.
    while len(freqSet) > m:
        H = calcConf(freqSet, H, supportData, brl, minConf)
        if len(H) < 2:
            # At least two surviving consequents are needed to merge.
            break
        H = aprioriGen(H, m + 1)
        m += 1
if __name__ == '__main__':
    # Alternative input file; to mine it instead, build the transactions
    # with: dataSet = loadDataSet(filename)
    filename = 'Groceries.csv'
    # Each line of kosarak.dat is split on whitespace into one transaction.
    # The context manager makes sure the file handle is closed after reading.
    with open('kosarak.dat') as f:
        dataSet = [line.split() for line in f]
    L, supportData = apriori(dataSet, 0.05)
    rules = generateRules(L, supportData, 0.9)