http://mayuxiang.sinaapp.com/?p=216
关注另一种数据挖掘算法,这种算法主要用于频繁项集的发现。
熟悉网上购物的同学,一定都了解什么是“购物车”,或者至少见过在超市购物时所推的购物车吧。亚马逊最早提出了一个算法,就是基于购物车的内容来为用户做出新的推荐。
下面通过一个简单的例子来看看什么是 A-priori 算法:
#!/usr/bin/python
#coding:utf-8
# Sample corpus: one inner list per sentence, already split into words.
# Words are kept exactly as written (case is significant).
data = [
    ['Cat', 'and', 'dog', 'bites'],
    ['Yahoo', 'news', 'claims', 'cat', 'mated', 'with', 'a', 'dog', 'and',
     'produced', 'viable', 'offspring'],
    ['Cat', 'Killer', 'likely', 'is', 'a', 'big', 'dog'],
    ['Professional', 'free', 'advice', 'on', 'dog', 'puppy', 'training'],
    ['Cat', 'and', 'kitten', 'training', 'behavior'],
    ['Dog', '&', 'Cat', 'providers', 'dog', 'training', 'in', 'Engene',
     'Oregon'],
    ['Dog', 'and', 'cat', 'is', 'slang', 'term', 'used', 'by', 'police',
     'officers', 'for', 'a', 'male-female', 'relationship'],
    ['Shop', 'for', 'your', 'show', 'dog', 'grooming', 'and', 'pet',
     'suppliers'],
]
#wash the data and keep words unique in each row
def delDuplication(data):
    """Remove repeated words from every row of ``data``, in place.

    Each row keeps the first occurrence of every word and preserves the
    original word order. Comparison is exact, so 'Dog' and 'dog' are treated
    as different words.

    data: list of rows, each row a list of hashable words. The row objects
        themselves are modified; nothing is returned.
    """
    for row in data:
        seen = set()
        unique = []
        for word in row:
            if word not in seen:
                seen.add(word)
                unique.append(word)
        # Slice-assign so callers holding a reference to the row see the change.
        row[:] = unique
#drop the entries that appear too rarely
#threhold: the smallest number of recorded positions an entry must have
def supportThreshold(res,threhold=1):
    """Return a new dict with only the entries of ``res`` that have enough support.

    res: mapping from an item to the list of positions where it was seen.
    threhold: minimum length of the position list for an entry to be kept.

    The input mapping is left untouched; the kept position lists are the same
    list objects as in ``res`` (not copies).
    """
    kept = {}
    for key, positions in res.items():
        if len(positions) < threhold:
            continue
        kept[key] = positions
    return kept
def statusCount(threhold=3):
    """Build the single-word index: word -> list of row numbers containing it.

    Reads the module-level ``data`` list, passes it through
    ``delDuplication``, records for every word the indices of the rows it
    occurs in, and then drops the words with too little support through
    ``supportThreshold``. The surviving mapping is printed and returned.

    threhold: minimum number of rows a word must occur in to be kept.
        Defaults to 3, the value that was previously hard-coded. The spelling
        matches the keyword used by ``supportThreshold``.
    """
    res = {}  # word -> ascending list of row indices
    delDuplication(data)  # delete duplicated words inside each row
    for i, row in enumerate(data):
        for word in row:
            positions = res.setdefault(word, [])
            # Count a row only once per word, even if the word is repeated in
            # that row; otherwise the support of the word would be inflated.
            if not positions or positions[-1] != i:
                positions.append(i)
    res = supportThreshold(res, threhold)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("%s %s" % ("一维关联:", res))
    return res
def comparePos(posX, posY):
    """Return the positions of ``posX`` that also occur in ``posY``.

    Used to find the sentences in which two items appear together. The result
    keeps the order (and any repetitions) of ``posX``.

    posX, posY: lists of hashable positions (row indices in this script).

    Membership is checked against a set built from ``posY``, so the result
    does not depend on ``posY`` being sorted and the cost is linear in the
    combined length of the two lists.
    """
    shared = set(posY)
    return [p for p in posX if p in shared]
def apriori(res, threhold=3):
    """Combine the items of ``res`` pairwise and keep the well-supported pairs.

    res: mapping from an item to the list of positions where it occurs.
    threhold: minimum number of shared positions a pair needs to be kept.
        Defaults to 3, the value that was previously hard-coded. The spelling
        matches the keyword used by ``supportThreshold``.

    Returns a dict whose keys are ``(itemX, itemY)`` tuples and whose values
    are the positions shared by both items (computed by ``comparePos``),
    filtered through ``supportThreshold``. Each unordered pair appears once,
    with the item that comes first in ``res``'s iteration order first. When
    the items of ``res`` are themselves tuples, the keys are tuples of tuples.
    """
    items = list(res)
    hres = {}
    for idx, itemX in enumerate(items):
        # Pair each item only with the items after it, so (X, Y) is produced
        # once and the mirrored (Y, X) never is.
        for itemY in items[idx + 1:]:
            hres[(itemX, itemY)] = comparePos(res[itemX], res[itemY])
    return supportThreshold(hres, threhold)
if __name__ == "__main__":
    # Level 1: single words with enough support (statusCount prints them).
    oneDe = statusCount()
    # Level 2: pairs of frequent words that share enough sentences.
    twoDe = apriori(oneDe)
    # Single pre-formatted argument: prints the same text on Python 2 and 3.
    print("%s %s" % ("二维关联:", twoDe))
    # Next level: pairs of the frequent pairs found above.
    fourDe = apriori(twoDe)
    print("%s %s" % ("四维关联:", fourDe))