1.算法核心思想:Apriori算法实际上是关联规则的学习过程
2.首先要弄清楚两个指标:支持度、置信度
事件A发生的概率P(A)即为A的支持度
事件A、B同时发生的概率P(A,B)为AB的支持度
事件A发生导致事件B发生的概率P(A,B)/P(A)为A——>B这条规则的置信度,置信度是针对规则而言的
3.Apriori原理:如果某个项集是频繁的,那么它的所有子集也是频繁的;反过来看,也就是说如果一个项集是非频繁集,那么它的所有超集也是非频繁的,那么我们就可以在筛选过程中逐步去掉非频繁集,从而大大减少计算量。
4.算法伪代码:
(1)生成频繁项集列表
生成单元素项集
while 集合中项的个数大于0
构建一个k个项组成的候选项集的列表
从候选项集中筛选出频繁项集
k<——k+1
(2)生成关联规则
python代码实现:
# -*- coding:utf8 -*-
# 创建数据集
def createDataSet():
return [[1,3,4], [2,3,5], [1,2,3,5], [2,5]]
# 生成单元素项集
def createC1(dataSet):
C1 = []
for data in dataSet:
for item in data:
if not [item] in C1:
C1.append([item])
C1.sort()
return map(frozenset, C1)
# 从候选集中筛选频繁集
def scanD(D, Ck, minSupport):
ssCnt = {}
for tid in D:
for can in Ck:
if can.issubset(tid):
if not ssCnt.has_key(can):
ssCnt[can] = 1
else:
ssCnt[can] += 1
numItems = float(len(D))
retList = []
supportData = {}
for key in ssCnt:
support = ssCnt[key]/numItems
if support >= minSupport:
retList.append(key)
supportData[key] = support
return retList, supportData
# 生成候选集
def aprioriGen(Lk, k):
retList = []
lenLk = len(Lk)
for i in range(lenLk):
for j in range(i+1, lenLk):
# 前k-2项相同时,将两个集合合并
L1 = list(Lk[i])[:k-2]; L2 = list(Lk[j])[:k-2]
L1.sort(); L2.sort()
if L1==L2:
retList.append(Lk[i] | Lk[j])
return retList
# 生成频繁项集列表
def apriori(dataSet, minSupport=0.5):
C1 = createC1(dataSet)
D = map(set, dataSet)
L1, supportData = scanD(D, C1, minSupport)
L = [L1]
k = 2
while(len(L[k-2]) > 0):
Ck = aprioriGen(L[k-2], k)
Lk, supK = scanD(D, Ck, minSupport)
supportData.update(supK)
L.append(Lk)
k += 1
return L, supportData
# 生成关联规则
def generateRules(L, supportData, minConf=0.7):
bigRuleList = []
for i in range(1, len(L)):
for freqSet in L[i]:
H1 = [frozenset([item]) for item in freqSet]
if (i>1):
rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
else:
calcConf(freqSet, H1, supportData, bigRuleList, minConf)
return bigRuleList
# 计算置信度
def calcConf(freqSet, H, supportData, brl, minConf=0.7):
prunedH = []
for conseq in H:
conf = supportData[freqSet]/supportData[freqSet-conseq]
if conf >= minConf:
print freqSet-conseq, '-->', conseq, 'conf:', conf
brl.append((freqSet-conseq, conseq, conf))
prunedH.append(conseq)
return prunedH
# 生成候选规则集合
def rulesFromConseq(freqSet, H, supportData, brl, minConf=0.7):
m = len(H[0])
if (len(freqSet) > (m+1)):
Hmpl = aprioriGen(H, m+1)
Hmpl = calcConf(freqSet, Hmpl, supportData, brl, minConf)
if (len(Hmpl)>1):
rulesFromConseq(freqSet, Hmpl, supportData, brl, minConf)
if __name__ == '__main__':
dataSet = createDataSet()
L,supportData = apriori(dataSet, minSupport=0.5)
# print L
rules = generateRules(L, supportData, minConf=0.7)
print rules