Apriori
买A的人,有多大的概率也会买B
$$confidence(A \to B) = \frac{N(\text{买A也买B的transaction})}{N(\text{买A的transaction})}$$
所以需要从买1件商品、买2件商品,买3件商品开始逐一统计。速度很慢
- 调用函数
import numpy as np
import pandas as pd
from apyori import apriori
store_data = pd.read_csv('data_path', header=None)

# Convert the DataFrame into a list of transactions: one list of item-name
# strings per CSV row. Iterating over the frame's own rows/columns avoids
# hard-coding its dimensions, and empty (NaN) cells are skipped so that the
# padding of short rows does not turn into a fake item called 'nan'.
records = [
    [str(item) for item in row if pd.notna(item)]
    for row in store_data.values
]

association_rules = apriori(
    records,
    min_support=0.0045,   # minimum fraction of transactions containing the itemset
    min_confidence=0.2,   # minimum confidence of a rule
    min_lift=3,           # minimum lift of a rule
    # Intended meaning: a rule must involve at least two products.
    # NOTE(review): apyori's documented options are min_support,
    # min_confidence, min_lift and max_length; min_length may be silently
    # ignored -- confirm against the installed version.
    min_length=2,
)

# apriori() yields its results lazily, so materialise them before indexing.
association_results = list(association_rules)

# Inspect the rules one by one (here: the first one, if any were found).
if association_results:
    print(association_results[0])
- 手动实现
def createC1(dataSet):
    """Build C1, the list of all candidate itemsets of size one.

    Args:
        dataSet: iterable of transactions, each an iterable of hashable,
            mutually comparable items.

    Returns:
        A list with one ``frozenset([item])`` per distinct item, ordered by
        ascending item value.
    """
    # First collect every distinct product that appears in any transaction.
    unique_items = {item for transaction in dataSet for item in transaction}
    # frozenset is immutable and hashable, so each candidate can be used as a
    # dictionary key, and it offers issubset() for the support scan.
    # A real list is returned (not a lazy map object) because the candidates
    # are iterated once per transaction and must survive repeated passes.
    return [frozenset([item]) for item in sorted(unique_items)]
def scanD(Dataset, Ck, minSupport):
    """Compute the support of every candidate in Ck and keep the frequent ones.

    Args:
        Dataset: sized, re-iterable collection of transactions; each
            transaction must be usable as the argument of
            ``frozenset.issubset`` (e.g. a set of items).
        Ck: iterable of candidate itemsets (frozensets).
        minSupport: minimum fraction of transactions that must contain a
            candidate for it to be kept.

    Returns:
        A tuple ``(retList, supportData)``: ``retList`` is the list of
        candidates whose support is >= ``minSupport``; ``supportData`` maps
        every candidate that occurs in at least one transaction to its
        support (including candidates below the threshold).
    """
    # The candidates are walked once per transaction, so a one-shot iterator
    # must be materialised first or it would be empty after the first pass.
    candidates = list(Ck)

    # 1. Count, for each candidate, the transactions that contain it.
    ssCnt = {}
    for trans in Dataset:
        for can in candidates:
            if can.issubset(trans):
                ssCnt[can] = ssCnt.get(can, 0) + 1

    # 2. Turn counts into supports and filter by the threshold.
    retList = []        # Lk: the frequent itemsets
    supportData = {}    # support (fraction of transactions) per itemset
    for key in ssCnt:
        support = ssCnt[key] / float(len(Dataset))
        if support >= minSupport:
            retList.append(key)
        supportData[key] = support
    # Same ordering as prepending each hit, without the O(n) insert(0, ...).
    retList.reverse()
    return retList, supportData
def aprioriGen(Lk, k):
    """Build the next candidate list Ck by joining itemsets from Lk.

    Example: given {0}, {1}, {2} and k=2 the result is {0,1}, {0,2}, {1,2}.

    Args:
        Lk: list of frozensets, each holding k-1 mutually comparable items.
        k: size of the itemsets to generate; it grows by one per round.

    Returns:
        A list with the union of every pair of itemsets in ``Lk`` whose first
        k-2 items (in sorted order) are identical.
    """
    # Sort BEFORE slicing: a frozenset has no defined iteration order, so
    # taking the "first k-2 items" of the raw set and sorting afterwards can
    # compare the wrong items and miss valid joins. Computing each prefix once
    # here also keeps the work out of the pairwise loop.
    prefixes = [sorted(itemset)[:k - 2] for itemset in Lk]

    retList = []
    count = len(Lk)
    for i in range(count):
        for j in range(i + 1, count):   # pair i only with the itemsets after it
            if prefixes[i] == prefixes[j]:
                # Same prefix: the union has exactly k items.
                retList.append(Lk[i] | Lk[j])
    return retList
def apriori(dataSet, minSupport=0.5):
    """Find all frequent itemsets of a transaction data set.

    Args:
        dataSet: iterable of transactions, each an iterable of items.
        minSupport: minimum fraction of transactions an itemset must appear
            in to be considered frequent.

    Returns:
        A tuple ``(L, supportData)``. ``L[i]`` is the list of frequent
        itemsets with i+1 items; the last entry of ``L`` is always an empty
        list (the round that found nothing and stopped the search).
        ``supportData`` maps every itemset whose support was computed to that
        support.
    """
    # Turn every transaction into a set. This must be a real list: the data is
    # scanned once per round and scanD() needs len(), neither of which a
    # one-shot map object supports.
    D = [set(transaction) for transaction in dataSet]
    # All single-item candidates, also materialised for repeated iteration.
    C1 = list(createC1(D))

    # L1: single items that satisfy minSupport; supportData: their supports.
    L1, supportData = scanD(D, C1, minSupport)
    L = [L1]    # collects the frequent itemsets of every size
    k = 2
    # Keep going while the previous round still produced frequent itemsets.
    while len(L[k - 2]) > 0:
        # Join the (k-1)-itemsets into candidates of size k ...
        Ck = aprioriGen(L[k - 2], k)
        # ... then scan the data and keep those meeting minSupport.
        Lk, supK = scanD(D, Ck, minSupport)
        supportData.update(supK)
        L.append(Lk)
        k += 1  # next round builds itemsets one item larger
    return L, supportData
主函数:generateRules(),三个输入:频繁项集列表L、支持度表supportData和最小可信度minConf,输出一个包含可信度的规则列表bigRuleList
def calcConf(freqSet, H, supportData, br1, minConf=0.7):
    """Evaluate the confidence of the rules ``freqSet - c --> c`` for c in H.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: iterable of candidate consequents (frozensets).
        supportData: mapping from itemset to its support; must contain
            ``freqSet`` and every ``freqSet - c``.
        br1: list that receives a ``(antecedent, consequent, confidence)``
            tuple for every rule that passes the threshold (mutated in place).
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        The list of consequents from ``H`` whose rule reached ``minConf``.
    """
    surviving = []
    for consequent in H:
        joint_support = supportData[freqSet]
        antecedent = freqSet - consequent
        confidence = joint_support / supportData[antecedent]
        if confidence >= minConf:   # threshold filter
            br1.append((antecedent, consequent, confidence))
            surviving.append(consequent)
    return surviving
def rulesFromConseq(freqSet, H, supportData, br1, minConf=0.7):
    """Recursively derive rules whose consequents are larger than those in H.

    The consequents in ``H`` (all of size m) are merged into candidates of
    size m+1, those are scored with calcConf(), and the survivors are merged
    again, until the consequent would swallow the whole itemset or fewer than
    two survivors remain. The consequents in ``H`` themselves are NOT scored
    here.

    Args:
        freqSet: frozenset, the frequent itemset the rules are derived from.
        H: list of equally sized consequents (frozensets); may be empty.
        supportData: mapping from itemset to its support.
        br1: list that collects the accepted rules (mutated in place).
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        None; accepted rules are appended to ``br1``.
    """
    if not H:
        # Nothing to merge; also avoids the IndexError from H[0] below.
        return
    m = len(H[0])
    # A consequent of size m+1 must still leave a non-empty antecedent.
    if len(freqSet) > m + 1:
        Hmp1 = aprioriGen(H, m + 1)
        Hmp1 = calcConf(freqSet, Hmp1, supportData, br1, minConf)
        # At least two survivors are needed to build even larger consequents.
        if len(Hmp1) > 1:
            rulesFromConseq(freqSet, Hmp1, supportData, br1, minConf)
def generateRules(L, supportData, minConf=0.7):
    """Generate association rules from the frequent itemsets.

    For every frequent itemset with at least two items, a list H1 of its
    single-item consequents is built and scored with calcConf(). For itemsets
    taken from L[2] onwards, the surviving consequents are then grown into
    larger ones by rulesFromConseq().

    Args:
        L: list of lists of frequent itemsets; ``L[0]`` (the single items) is
            skipped because no rule can be formed from one item.
        supportData: mapping from itemset to its support.
        minConf: minimum confidence a rule needs to be kept.

    Returns:
        A list of ``(antecedent, consequent, confidence)`` tuples.
    """
    bigRuleList = []
    for i, itemsets in enumerate(L[1:], start=1):
        for freqSet in itemsets:
            H1 = [frozenset([item]) for item in freqSet]
            # Always score the single-item consequents first. Going straight
            # to rulesFromConseq() for larger itemsets would only ever test
            # consequents of two or more items and silently drop rules such
            # as {2, 3} --> {5}.
            H1 = calcConf(freqSet, H1, supportData, bigRuleList, minConf)
            # Only consequents that passed can be part of a larger passing
            # consequent (a bigger consequent never raises the confidence),
            # and at least two are needed to merge.
            if i > 1 and len(H1) > 1:
                rulesFromConseq(freqSet, H1, supportData, bigRuleList, minConf)
    return bigRuleList