#!/usr/bin/python
# -*- coding: UTF-8 -*-
import re


def read_data():
    """Return the built-in sample data set: a list of transactions (lists of item names)."""
    return [
        ['bread', 'cream', 'milk', 'tea'],
        ['bread', 'cream', 'milk'],
        ['cake', 'milk'],
        ['milk', 'tea'],
        ['bread', 'cake', 'milk'],
        ['bread', 'tea'],
        ['beer', 'milk', 'tea'],
        ['bread', 'tea'],
        ['bread', 'cream', 'milk', 'tea'],
    ]


def find_all_frequent_1_itemsets(data_set, minsup, support_count):
    """Find all frequent 1-itemsets.

    ``support_count`` is a dict supplied by the caller and filled in place:
    each key is a ``frozenset`` holding one item, each value the number of
    transactions containing that item.

    Returns the set of 1-item frozensets whose support ratio
    (count / number of transactions) is at least ``minsup``.
    """
    for transaction in data_set:
        # Count each item once per transaction, so that a repeated item
        # inside one transaction cannot inflate its support; larger
        # itemsets are likewise counted once per transaction.
        for item in set(transaction):
            key = frozenset([item])
            support_count[key] = support_count.get(key, 0) + 1

    frequent_1_itemsets = set()
    for itemset in support_count:
        if float(support_count[itemset]) / len(data_set) >= minsup:
            frequent_1_itemsets.add(itemset)
    return frequent_1_itemsets


def apriori_gen(frequent_k_sub_1_itemset):
    """Generate candidate k-itemsets from the frequent (k-1)-itemsets.

    Two (k-1)-itemsets are merged when, after sorting their items, all items
    except the last are equal (i.e. they share the first k-2 items).
    Returns a set of frozensets.
    """
    # Sets are unordered, so work on sorted lists; the inputs themselves are
    # left untouched. A non-frozenset entry is treated as a single item.
    ordered = []
    for itemset in frequent_k_sub_1_itemset:
        if isinstance(itemset, frozenset):
            items = list(itemset)
        else:
            items = [itemset]
        items.sort()
        ordered.append(items)

    candidate_k_itemsets = set()
    # Each unordered pair is visited once; joining (a, b) and (b, a) would
    # produce the same candidate.
    for index, left in enumerate(ordered):
        for right in ordered[index + 1:]:
            if left != right and left[:-1] == right[:-1]:
                # frozenset so the candidate can itself be stored in a set.
                candidate_k_itemsets.add(frozenset(left + right))
    return candidate_k_itemsets


def subset(candidate_k_itemsets, transaction):
    """Return the set of candidates that are contained in ``transaction``."""
    transaction_items = set(transaction)
    Ct = set()
    for candidate_k in candidate_k_itemsets:
        if set(candidate_k).issubset(transaction_items):
            Ct.add(candidate_k)
    return Ct


def extract_the_frequent_K_itemsets(data_set, candidate_k_itemsets, minsup, support_count):
    """Keep the candidates whose support ratio reaches ``minsup``.

    ``support_count`` maps each candidate to its transaction count; the ratio
    is that count divided by ``len(data_set)``. Returns a set.
    """
    frequent_K_itemsets = set()
    for k_itemsets in candidate_k_itemsets:
        if float(support_count[k_itemsets]) / len(data_set) >= minsup:
            frequent_K_itemsets.add(k_itemsets)
    return frequent_K_itemsets


def gen_frequent_itemsets(data_set, minsup):
    """Run the Apriori level-wise search.

    Returns ``(result, support)`` where ``result`` is a list of sets of
    frozensets (``result[0]`` holds the frequent 1-itemsets, and so on) and
    ``support`` maps every frequent itemset to its support ratio.
    """
    # key: itemset (frozenset), value: number of transactions containing it.
    support_count = {}
    Fk = find_all_frequent_1_itemsets(data_set, minsup, support_count)
    result = [Fk]

    while Fk:
        Ck = apriori_gen(Fk)  # candidates one item larger than Fk's members
        for candidate in Ck:
            support_count[frozenset(candidate)] = 0
        for transaction in data_set:
            for candidate in subset(Ck, transaction):
                support_count[frozenset(candidate)] += 1
        Fk = extract_the_frequent_K_itemsets(data_set, Ck, minsup, support_count)
        if Fk:
            result.append(Fk)

    # Keep only frequent entries and turn counts into ratios. The threshold
    # is applied to the ratio (not the raw count), matching the test used
    # when the frequent itemsets were extracted.
    support = {}
    for itemset in support_count:
        ratio = float(support_count[itemset]) / len(data_set)
        if ratio >= minsup:
            support[itemset] = ratio
    return result, support


def gen_related_rule(freq_set, support_data, min_conf):
    """Generate association rules from frequent itemsets.

    ``freq_set`` is a list of sets of frozensets, ``support_data`` maps
    itemsets to their support, ``min_conf`` is the minimum confidence.

    Returns a list of ``(antecedent, consequent, confidence)`` tuples where
    confidence = support(antecedent | consequent) / support(antecedent).
    Raises ``KeyError`` if a needed itemset is missing from ``support_data``.
    """
    related_rule_list = []
    seen_rules = set()  # constant-time duplicate check for emitted rules
    sub_set_list = []   # every frequent itemset visited so far

    for frequent_K_itemsets in freq_set:
        for freq_k_item in frequent_K_itemsets:
            sub_set_list.append(freq_k_item)
            for sub_set in sub_set_list:
                if freq_k_item == sub_set or not sub_set.issubset(freq_k_item):
                    continue
                antecedent = freq_k_item - sub_set
                # float() keeps the division exact-ratio even if a caller
                # supplies integer supports.
                conf = float(support_data[freq_k_item]) / support_data[antecedent]
                related_rule = (antecedent, sub_set, conf)
                if conf >= min_conf and related_rule not in seen_rules:
                    seen_rules.add(related_rule)
                    related_rule_list.append(related_rule)
    return related_rule_list
# Demo run: mine the sample transactions, then print every frequent itemset
# with its support, followed by every association rule with its confidence.
data_set = read_data()
minsup = 0.2
minconf = 0.7
freq_set, support = gen_frequent_itemsets(data_set, minsup)
related_rule = gen_related_rule(freq_set, support, minconf)

# Single-argument print(...) with %-formatting emits the same text under
# both the print statement (Python 2) and the print function (Python 3).
k = 1
for frequent_K_itemsets in freq_set:
    print("%s %s" % (str("frequent" + str(k)).ljust(70), "support"))
    print("=" * 90)
    k += 1
    for frequent_K_item in frequent_K_itemsets:
        print("%s %s" % (str(frequent_K_item).ljust(70), support[frequent_K_item]))
    print("=" * 90)

print("%s %s" % ("related_rule".ljust(70), "conf"))
print("=" * 90)
for i in related_rule:
    print("%s %s" % ((str(i[0]) + "=>" + str(i[1])).ljust(70), str(i[2])))
print("=" * 90)