apriori.py
# -*- coding: utf-8 -*-
import numpy
def loadDataSet():
    """Return the small hard-coded transaction data set used by the examples.

    Returns:
        A new list of four transactions; each transaction is a list of
        integer item ids.
    """
    return [[1, 3, 4], [2, 3, 5], [1, 2, 3, 5], [2, 5]]
def creatC1(dataSet):
    """Build the candidate 1-itemsets: one frozenset per distinct item.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable,
            mutually orderable items.

    Returns:
        A list of single-item frozensets, ordered by item. A real list is
        returned (not a lazy ``map`` object) because the candidates are
        iterated once per transaction by ``scanD``.
    """
    # A set removes duplicates in O(1) per item instead of scanning a list.
    items = {item for transaction in dataSet for item in transaction}
    # frozenset, not set: the candidates are later used as dictionary keys.
    return [frozenset([item]) for item in sorted(items)]
def scanD(D, Ck, minSupport):
    """Count candidate itemsets in the data set and keep the frequent ones.

    Args:
        D: The data set, an iterable of transactions (each an iterable of
            items).
        Ck: Iterable of candidate itemsets (frozensets) to count.
        minSupport: Minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``. ``retList`` lists the candidates
        whose support is at least ``minSupport``. ``supportData`` maps every
        candidate found in at least one transaction to its support.
        Both are empty when ``D`` is empty.
    """
    # Materialize both inputs: D is measured with len() and Ck is walked
    # once per transaction, so one-shot iterators would silently break.
    transactions = list(D)
    candidates = list(Ck)
    if not transactions:
        # No transactions: support is undefined, avoid dividing by zero.
        return [], {}

    # ssCnt maps each candidate itemset to the number of transactions
    # that contain it.
    ssCnt = {}
    for tid in transactions:
        for can in candidates:
            if can.issubset(tid):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    numItems = float(len(transactions))
    retList = []  # candidates that satisfy the minimum support
    supportData = {}
    for key, count in ssCnt.items():
        support = count / numItems
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Same ordering as inserting each hit at the front, without the
    # repeated O(n) list shifts.
    retList.reverse()
    return retList, supportData
# Module-level demo, executed at import time: load the sample data set.
D = loadDataSet()
# Candidate 1-itemsets built from the sample data set.
C1 = creatC1(D)
# Itemsets from C1 kept at a minimum support of 0.5, plus the support mapping.
L1, suppData0 = scanD(D,C1,0.5)
def aprioriGen(Lk, k):
    """Create the candidate k-itemsets Ck by joining itemsets from Lk.

    Two itemsets are merged when their first ``k - 2`` items, taken in
    sorted order, are identical.

    Args:
        Lk: List of frozensets, all of size ``k - 1``. Items must be
            mutually orderable.
        k: Size of the candidate itemsets to generate.

    Returns:
        A list of frozensets, one per joined pair.
    """
    # Sort BEFORE slicing: set iteration order is arbitrary, so taking the
    # first k-2 elements of an unsorted listing can miss valid joins.
    # When k == 2 every prefix is empty, so all pairs are joined.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]
    retList = []
    lenLk = len(Lk)
    for i in range(lenLk):
        for j in range(i + 1, lenLk):
            if prefixes[i] == prefixes[j]:
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets in ``dataSet`` with the Apriori algorithm.

    Args:
        dataSet: Iterable of transactions, each an iterable of items.
        minSupport: Minimum support (fraction of transactions) an itemset
            needs in order to be kept.

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` is the list of frequent
        itemsets of size ``i + 1``; the last entry of ``L`` is the empty
        list that ended the search. ``supportData`` maps itemsets to their
        support, as reported by ``scanD``.
    """
    C1 = creatC1(dataSet)
    # A real list, not a lazy map object: the transactions are scanned once
    # per itemset size, which a one-shot iterator cannot support.
    D = [set(transaction) for transaction in dataSet]
    # Frequent 1-itemsets and their supports.
    L1, supportData = scanD(D, C1, minSupport)
    # L collects L1, L2, L3, ...
    L = [L1]
    k = 2
    # Keep growing the itemset size until a level yields nothing frequent.
    while L[k - 2]:
        Ck = aprioriGen(L[k - 2], k)
        # Filter out the candidates that do not reach the minimum support.
        Lk, supk = scanD(D, Ck, minSupport)
        supportData.update(supk)
        L.append(Lk)
        k += 1
    return L, supportData
'''
以上部分生成了满足最小支持度的频繁项目集合
'''
'''
下面的内容是从频繁项集中挖掘关联规则
'''
def generateRules(L, supportData, minConf=0.7):
    """Mine association rules from the frequent itemsets.

    Args:
        L: List of lists of frequent itemsets, where ``L[i]`` holds the
            itemsets of size ``i + 1`` (as returned by ``apriori``).
        supportData: Mapping from itemset (frozenset) to its support.
        minConf: Minimum confidence threshold a rule must reach.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    # Start at index 1: single-item sets cannot be split into a rule.
    for i in range(1, len(L)):
        for freqSet in L[i]:
            H1 = [frozenset([item]) for item in freqSet]
            if i > 1:
                # Itemsets with more than two items: build larger
                # consequents via rulesFromConseq.
                # NOTE(review): rulesFromConseq begins by merging H1 into
                # 2-item consequents, so single-item consequents are never
                # evaluated for these itemsets - confirm this is intended.
                rulesFromConseq(freqSet, H1, supportData, bigRuleList,
                                minConf)
            else:
                # Two-item sets: compute the confidence directly.
                calcConf(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate candidate consequents of ``freqSet`` and keep confident rules.

    For each consequent ``c`` in ``H`` the rule ``(freqSet - c) --> c`` has
    confidence ``support(freqSet) / support(freqSet - c)``.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: Iterable of candidate consequents (frozensets).
        supportData: Mapping from itemset to support; must contain
            ``freqSet`` and every ``freqSet - c``.
        br1: List that accepted ``(antecedent, consequent, confidence)``
            tuples are appended to (mutated in place).
        minConf: Minimum confidence a rule must reach to be accepted.

    Returns:
        The list of consequents whose rule met ``minConf``.

    Raises:
        KeyError: If a required itemset is missing from ``supportData``.
    """
    prunedH = []
    for conseq in H:
        # Set difference, not arithmetic: the items of freqSet outside conseq.
        antecedent = freqSet - conseq
        conf = supportData[freqSet] / supportData[antecedent]
        if conf >= minConf:
            # %-formatting emits the same text on Python 2 and Python 3.
            print('%s --> %s conf: %s' % (antecedent, conseq, conf))
            br1.append((antecedent, conseq, conf))
            prunedH.append(conseq)
    return prunedH
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively generate rules with progressively larger consequents.

    Args:
        freqSet: The frequent itemset (frozenset) the rules are built from.
        H: List of equally sized consequents (frozensets) to merge.
        supportData: Mapping from itemset to support.
        br1: List that accepted rules are appended to by ``calcConf``.
        minConf: Minimum confidence a rule must reach to be accepted.

    Returns:
        None. Accepted rules are reported through ``br1``.
    """
    if not H:
        # Nothing to merge; also avoids an IndexError on H[0].
        return
    m = len(H[0])
    # Only grow the consequent while at least one item remains on the
    # antecedent side.
    # NOTE(review): the consequents in H themselves are never scored here,
    # only the merged (m+1)-item ones - confirm this is intended.
    if len(freqSet) > (m + 1):
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        # At least two surviving consequents are needed to merge further.
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
test.py:在这里运行并查看结果
# -*- coding: utf-8 -*-
import numpy
import apriori
# Load the sample transactions and mine the frequent itemsets.
dataSet = apriori.loadDataSet()
L, suppData = apriori.apriori(dataSet)
# print(L)
# print(suppData)  # suppData holds the support of every itemset
# Derive the association rules with a minimum confidence of 0.5.
rules = apriori.generateRules(L, suppData, 0.5)
# Parenthesized single-argument print works on both Python 2 and Python 3.
print(rules)
运行结果(Python 2 下的输出):
frozenset([3]) --> frozenset([1]) conf: 0.666666666667
frozenset([1]) --> frozenset([3]) conf: 1.0
frozenset([5]) --> frozenset([2]) conf: 1.0
frozenset([2]) --> frozenset([5]) conf: 1.0
frozenset([3]) --> frozenset([2]) conf: 0.666666666667
frozenset([2]) --> frozenset([3]) conf: 0.666666666667
frozenset([5]) --> frozenset([3]) conf: 0.666666666667
frozenset([3]) --> frozenset([5]) conf: 0.666666666667
frozenset([5]) --> frozenset([2, 3]) conf: 0.666666666667
frozenset([3]) --> frozenset([2, 5]) conf: 0.666666666667
frozenset([2]) --> frozenset([3, 5]) conf: 0.666666666667
[(frozenset([3]), frozenset([1]), 0.6666666666666666), (frozenset([1]), frozenset([3]), 1.0), (frozenset([5]), frozenset([2]), 1.0), (frozenset([2]), frozenset([5]), 1.0), (frozenset([3]), frozenset([2]), 0.6666666666666666), (frozenset([2]), frozenset([3]), 0.6666666666666666), (frozenset([5]), frozenset([3]), 0.6666666666666666), (frozenset([3]), frozenset([5]), 0.6666666666666666), (frozenset([5]), frozenset([2, 3]), 0.6666666666666666), (frozenset([3]), frozenset([2, 5]), 0.6666666666666666), (frozenset([2]), frozenset([3, 5]), 0.6666666666666666)]
参考资料:【美】Peter Harrington.《Machine Learning in Action》