Apriori: find frequent itemsets by level-wise iteration based on candidate generation
Input:
- D: a database of transactions
- min_support: minimum support threshold (the support of an itemset AB is P(A∪B), the fraction of transactions containing both A and B)
Output: L, the frequent itemsets in D
Method:
- L1 = find_frequent_1_itemsets(D)
- for (k = 2; Lk-1 != ∅; k++) {
-     Ck = apriori_generate(Lk-1)
-     for each transaction t ∈ D {  // scan D to count each candidate
-         Ct = subset(Ck, t)  // the candidates in Ck that are contained in t
-         for each candidate c ∈ Ct
-             c.count++
-     }
-     Lk = {c ∈ Ck | c.count >= min_support}
- }
- return L = ∪k Lk
apriori_generate(Lk-1: the frequent (k-1)-itemsets)
- for each itemset l1 ∈ Lk-1
-     for each itemset l2 ∈ Lk-1
-         if (l1[1] = l2[1] and ... and l1[k-2] = l2[k-2] and l1[k-1] < l2[k-1]) then {
-             c = l1 ⋈ l2  // join step: c is the union of l1 and l2
-             if has_infrequent_subset(c, Lk-1) then  // prune step, by the Apriori property: every nonempty subset of a frequent itemset must also be frequent
-                 delete c
-             else add c to Ck
-         }
- return Ck
has_infrequent_subset(c, Lk-1)
- for each (k-1)-subset s of c
-     if s ∉ Lk-1 then
-         return True
- return False
Reference: https://www.cnblogs.com/llhthinker/p/6719779.html
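To make the join and prune steps concrete before the full implementation below, here is a minimal self-contained sketch. The itemsets in L2 are hypothetical values chosen for illustration, and the sorted-tuple representation is simplified relative to the frozenset-based code that follows:

from itertools import combinations

# Hypothetical frequent 2-itemsets, each stored as a sorted tuple.
L2 = {('l1', 'l2'), ('l1', 'l3'), ('l2', 'l3'), ('l2', 'l4')}

C3 = set()
for p in L2:
    for q in L2:
        # Join step: first k-2 = 1 items equal, last item of p strictly smaller.
        if p[:-1] == q[:-1] and p[-1] < q[-1]:
            c = tuple(sorted(set(p) | set(q)))  # union of p and q, a 3-itemset
            # Prune step: every (k-1)-subset of c must already be frequent.
            if all(s in L2 for s in combinations(c, 2)):
                C3.add(c)

print(C3)  # {('l1', 'l2', 'l3')}; ('l2', 'l3', 'l4') is pruned because ('l3', 'l4') is not in L2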
# coding: utf-8
def loadDataSet():
    # Sample transaction database: nine transactions over the items l1..l5.
    dataSet = [['l1', 'l2', 'l5'], ['l2', 'l4'], ['l2', 'l3'],
               ['l1', 'l2', 'l4'], ['l1', 'l3'], ['l2', 'l3'],
               ['l1', 'l3'], ['l1', 'l2', 'l3', 'l5'], ['l1', 'l2', 'l3']]
    return dataSet
def createC1(dataSet):  # build C1, the set of candidate 1-itemsets
    c1 = set()
    for t in dataSet:
        for item in t:
            itemSet = frozenset([item])
            # set is unordered, de-duplicated, and mutable, with add()/remove() and the
            # usual operations: union, intersection, difference, symmetric difference.
            # frozenset is the immutable variant: it is hashable, so it can be a dict key
            # or an element of another set, but it has no add()/remove() once created.
            c1.add(itemSet)
    return c1
# createC1(loadDataSet())
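# Why frozenset matters here (illustrative note, not part of the algorithm):
# candidate itemsets must live inside the set c1 and later serve as keys of
# support_data. A plain set is unhashable, so {set(['l1'])} raises
# "TypeError: unhashable type: 'set'", while {frozenset(['l1'])} works.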
def isApriori(CkItem, Lksub1):
    # Prune test using the Apriori property: every nonempty subset of a frequent
    # itemset must itself be frequent.
    # CkItem: a candidate k-itemset
    # Lksub1: Lk-1, the set of frequent (k-1)-itemsets
    for item in CkItem:
        subCk = CkItem - frozenset([item])  # one (k-1)-subset of the candidate
        if subCk not in Lksub1:
            return False
    return True
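# Worked trace on the dataset above (min_support = 0.2, i.e. a count of at least 2):
# joining the frequent 2-itemsets {l1, l3} and {l1, l5} yields the candidate
# {l1, l3, l5}, but its subset {l3, l5} occurs in only one transaction, so it is
# not in L2; isApriori returns False and the candidate is pruned without a data scan.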
def createCk(Lksub1, k):
    # Join step: build Ck, the candidate k-itemsets, by joining pairs of frequent
    # (k-1)-itemsets whose first k-2 items agree after sorting.
    Ck = set()
    length = len(Lksub1)
    Lksub1List = list(Lksub1)
    for i in range(length):
        for j in range(i + 1, length):  # each unordered pair once; the union is symmetric
            l1 = list(Lksub1List[i])
            l2 = list(Lksub1List[j])
            l1.sort()
            l2.sort()
            if l1[0:k - 2] == l2[0:k - 2]:
                CkItem = Lksub1List[i] | Lksub1List[j]  # union gives a k-itemset
                if isApriori(CkItem, Lksub1):  # prune candidates with an infrequent subset
                    Ck.add(CkItem)
    return Ck
def generate_Lk_by_Ck(dataSet, Ck, min_support, support_data):
    # Scan the data once, count how many transactions contain each candidate in Ck,
    # and keep the candidates whose support reaches min_support; supports are
    # recorded in support_data. Returns the frequent k-itemsets Lk.
    Lk = set()
    item_count = {}  # occurrence count of each candidate
    for t in dataSet:
        for item in Ck:
            if item.issubset(t):  # the candidate appears in this transaction
                if item not in item_count:
                    item_count[item] = 1
                else:
                    item_count[item] += 1
    t_num = float(len(dataSet))
    for item in item_count:
        if item_count[item] / t_num >= min_support:
            Lk.add(item)
            support_data[item] = item_count[item] / t_num
    return Lk
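# Concrete first pass on the dataset above: the candidate 1-itemsets are counted as
# l1:6, l2:7, l3:6, l4:2, l5:2 over t_num = 9 transactions, and since 2/9 ~= 0.22
# >= min_support = 0.2, every single item survives into L1.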
def generate_L(dataSet, k, min_support):
    # Level-wise generation of all frequent itemsets of size 1..k.
    support_data = {}
    C1 = createC1(dataSet)
    L1 = generate_Lk_by_Ck(dataSet, C1, min_support, support_data)
    Lksub1 = L1.copy()
    L = []
    L.append(Lksub1)
    for i in range(2, k + 1):
        Ci = createCk(Lksub1, i)
        Li = generate_Lk_by_Ck(dataSet, Ci, min_support, support_data)
        if not Li:  # no frequent i-itemsets, so no larger itemset can be frequent
            break
        Lksub1 = Li.copy()
        L.append(Lksub1)
    return L, support_data
def generate_big_rules(L, support_data, min_conf):
    # Generate association rules: for every frequent itemset, emit the rule
    # (freq_set - sub_set) => sub_set whenever its confidence
    # support(freq_set) / support(freq_set - sub_set) reaches min_conf.
    big_rule_list = []
    sub_set_list = []  # all frequent itemsets seen so far (candidate right-hand sides)
    for i in range(len(L)):
        for freq_set in L[i]:
            for sub_set in sub_set_list:
                if sub_set.issubset(freq_set):
                    conf = support_data[freq_set] / support_data[freq_set - sub_set]
                    big_rule = (freq_set - sub_set, sub_set, conf)
                    if conf >= min_conf and big_rule not in big_rule_list:
                        big_rule_list.append(big_rule)
            sub_set_list.append(freq_set)
    return big_rule_list
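# Example with the supports of this dataset: freq_set = {l1, l2, l5} with
# sub_set = {l1, l2} yields the rule {l5} => {l1, l2}, whose confidence is
# support({l1, l2, l5}) / support({l5}) = (2/9) / (2/9) = 1.0.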
if __name__ == '__main__':
    dataSet = loadDataSet()
    L, support_data = generate_L(dataSet, k=3, min_support=0.2)
    big_rule_list = generate_big_rules(L, support_data, min_conf=0.7)
    for Lk in L:
        print('frequent ' + str(len(list(Lk)[0])) + '-itemsets\t\tsupport')
        print()
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
        print()
    print('Big Rules')
    for item in big_rule_list:
        print(item[0], '=>', item[1], 'conf: ', item[2])
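For reference, the result this run should produce, worked out by hand from the nine transactions (treat it as an expected result rather than captured output; the ordering of sets and rules depends on set iteration order): the maximal frequent itemsets are {l1, l2, l3} and {l1, l2, l5}, both with support 2/9, and six rules reach min_conf = 0.7, all with confidence 1.0:

# {l5} => {l1}, {l4} => {l2}, {l5} => {l2},
# {l2, l5} => {l1}, {l1, l5} => {l2}, {l5} => {l1, l2}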