#=====================================================================
# Apriori frequent-itemset mining (level-wise search).
#
# Itemsets are encoded as strings with one character per item, kept in
# sorted order (e.g. {'c', 'a'} -> 'ac').  Because of this encoding every
# item must be a single character.
#
# The print calls below pass one pre-formatted string each, so the script
# emits the same text under Python 2 and Python 3.
#=====================================================================
#=====================================================================
# data and parameters
#=====================================================================
# Transaction database: transaction id -> items in that transaction.
db = {10: ('a', 'c', 'd'),
      20: ('b', 'c', 'e'),
      30: ('a', 'b', 'c', 'e'),
      40: ('b', 'e')}
# The same transactions as sets, used for subset tests when counting.
db1 = [set(t) for t in db.values()]
# Minimum number of supporting transactions for an itemset to be frequent.
min_sup = 2


def setToStr(s):
    """
    change set to ordered string

    The items of *s* are sorted and concatenated, giving the canonical
    string key used for itemsets in ``table`` / ``ntable``.
    """
    return ''.join(sorted(s))


#=====================================================================
# main function
#=====================================================================
#---------------------------------------------------------------------
# scan DB once to get frequent 1-itemset
#---------------------------------------------------------------------
# count: support = number of transactions containing the item, so each
# transaction is de-duplicated first (a repeated item counts once);
# sorting keeps the iteration order deterministic.
table = {}
for t in db.values():
    for k in sorted(set(t)):
        table[k] = table.get(k, 0) + 1
# find frequent ones
ntable = {k: n for k, n in table.items() if n >= min_sup}
#---------------------------------------------------------------------
# Generate length (k+1) candidate itemsets
#---------------------------------------------------------------------
# Materialize the keys as a list: the join below slices it, which a
# Python 3 dict view does not support.
nlist = list(ntable)
q = 1
while nlist:
    #-----------------------------------------------------------------
    # Step 1: self-joining
    #-----------------------------------------------------------------
    print("-" * 50)
    print("this is the  %s th iteration." % q)
    q += 1
    candidates = []
    print("item list:  %s" % (nlist,))
    for i, a in enumerate(nlist):
        for b in nlist[i + 1:]:
            # Two frequent k-itemsets join when they share their first
            # k-1 items; the two differing last items are appended in
            # sorted order so the candidate string stays canonical.
            if a[:-1] == b[:-1]:
                candidates.append(a[:-1] + ''.join(sorted(a[-1] + b[-1])))
    #-----------------------------------------------------------------
    # Step 2: pruning
    #-----------------------------------------------------------------
    print("candidates(after self-joining): %s" % (candidates,))
    cp = []  # candidates pruned
    for c in candidates:
        # Keep a candidate only if every subset obtained by dropping one
        # item is itself frequent (i.e. is a key of ntable).
        if all(c[:k] + c[k + 1:] in ntable for k in range(len(c))):
            cp.append(set(c))
    print("candidates(after pruning): %s" % (cp,))
    #-----------------------------------------------------------------
    # Test the candidates against DB
    #-----------------------------------------------------------------
    table = {}
    for c in cp:
        support = sum(1 for t in db1 if c.issubset(t))
        # Candidates that occur in no transaction are left out of table.
        if support:
            table[setToStr(c)] = support
    print("candidates with frequency: %s" % (table,))
    ntable = {k: n for k, n in table.items() if n >= min_sup}
    print("current frequent pattern: %s" % (ntable,))
    # The next level is built from this level's frequent itemsets; the
    # loop ends once a level produces none.
    nlist = list(ntable)
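# NOTE: this is only a basic version of the Apriori algorithm. Items must be
# single characters such as 'a' or 'b'; multi-character items such as "123"
# or "234" are not supported.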