# 这个版本的 Apriori 经过优化:存储每个频繁项集在数据库中的分布(tid 列表),
# 通过类似与运算的方式逐层更新状态;该步骤用二分查找(bisect)加速。
import time

# Load the transaction database: one transaction per line, each a
# whitespace-separated list of integer item ids.
data = []
with open("C://Users//86180//Desktop//数据挖掘//retail.dat") as f:
    # Iterating the file object is the idiomatic, buffered replacement for
    # the original manual readline() loop; behavior is identical.
    for line in f:
        data.append([int(i) for i in line.split()])
# Count the global support (number of occurrences) of every single item.
# dict.get with a default replaces the original's fragile truthiness test
# (`if datamap.get(key):`), which conflates "missing" with any falsy count.
datamap = {}
for transaction in data:
    for item in transaction:
        datamap[item] = datamap.get(item, 0) + 1
# Build the tid-list (vertical data layout) of every single item:
# index_1[item] is the ascending list of transaction indices whose
# transaction contains `item`.
# BUG FIX: `keys` was used here but never defined (NameError), and the table
# was sized by len(keys) even though it is indexed by the raw item id — size
# it by the largest id instead, guarding against an empty database.
keys = list(datamap.keys())
n, m = (max(keys) + 1 if keys else 0), len(data)
index_1 = []
start = time.time()
for i in range(n):
    index_1.append([])
for i in range(m):
    for item in data[i]:
        index_1[item].append(i)
end = time.time()
print("运算时间:", end - start, "s", sep="")
def createCandidateList(F):
    '''Generate the k-item candidate sets from the (k-1)-item frequent sets.

    Standard F(k-1) x F(k-1) Apriori join: two frequent sets are merged when
    their sorted first k-2 elements agree; the union is then a k-itemset.

    Param:
        F: list of frozensets — the previous level's frequent itemsets.
    Returns:
        list of frozensets — the unpruned candidate itemsets.
    '''
    # Hoist the per-set sort out of the O(|F|^2) pair loop; the original
    # re-sorted both sets on every pair comparison.
    sorted_F = [sorted(s) for s in F]
    Ck = []
    for i in range(len(F)):
        for j in range(i + 1, len(F)):
            if sorted_F[i][:-1] == sorted_F[j][:-1]:
                Ck.append(F[i] | F[j])
    return Ck
def prune(Ck, F):
    '''Apriori prune step: keep a candidate only if ALL of its (k-1)-subsets
    are frequent.

    Param:
        Ck: candidate k-itemsets (iterables of items).
        F:  the (k-1)-level frequent itemsets (frozensets).
    Returns:
        list of frozensets — the candidates that survive the subset check.
    '''
    # O(1) membership instead of a linear scan of the list on every test.
    frequent = set(F)
    Fk = []
    for candidate in Ck:
        cand = frozenset(candidate)
        # BUG FIX: the original looped `range(len(candidate) - 2)` and so
        # never tested the subsets obtained by dropping the last elements of
        # the (arbitrary) set ordering; every (k-1)-subset must be checked.
        if all(cand - {t} in frequent for t in cand):
            Fk.append(cand)
    return Fk
import bisect
def transferStatus(I, Ck):
    '''Compute the tid-list of every pruned candidate in Ck.

    For a candidate, the tid-list is the intersection of the tid-list of one
    of its (k-1)-subsets (looked up in I) with the tid-list of the remaining
    single item (looked up in the module-level global index_1), computed by
    binary search: O(n log m) per candidate, as both lists are ascending.

    Param:
        I:  dict frozenset -> sorted tid-list, the previous level's states.
        Ck: the pruned candidate itemsets of the current level.
    Returns:
        dict frozenset -> sorted tid-list for the current level.

    NOTE(review): the split below takes an arbitrary (k-1)-subset of the
    candidate; this presumes pruning guaranteed that every such subset has an
    entry in I — confirm prune() ran on Ck before calling this.
    '''
    Ik = {}
    for item in Ck:
        members = list(item)
        # Hoisted lookups: the original rebuilt list(item) twice, wrapped the
        # prefix in a redundant list(), and indexed index_1[r] on every probe.
        prefix, last = frozenset(members[:-1]), members[-1]
        prev_tids = I[prefix]
        last_tids = index_1[last]
        idx = []
        p = 0
        for t in prev_tids:
            # Resume the binary search at p: both lists are ascending, so the
            # match position can only move forward.
            p = bisect.bisect_left(last_tids, t, p)
            if p == len(last_tids):
                break
            if last_tids[p] == t:
                p += 1
                idx.append(t)
        Ik[item] = idx
    # Dropped the original's `del I`: it only removed the local reference and
    # freed nothing, since the caller still holds the dict until reassignment.
    return Ik
def scanDB(mincount, Ck, I, SupportData):
    '''Select the frequent itemsets of the current level by support count.

    (The original docstring was copy-pasted from a rule-generation routine
    and described confidence/lift, which this function never computes.)

    Param:
        mincount:    minimum absolute support count.
        Ck:          pruned candidate itemsets.
        I:           dict itemset -> tid-list; the support of an itemset is
                     the length of its tid-list.
        SupportData: dict updated in place with the support count of every
                     frequent itemset found.
    Returns:
        list of the itemsets in Ck whose support count >= mincount.
    '''
    Lk = []
    for item in Ck:
        support = len(I[item])
        if support >= mincount:
            Lk.append(item)
            SupportData[item] = support
    return Lk
def Apriori(data, minSupport):
    '''Mine all frequent itemsets of `data` with the tid-list Apriori.

    (The original docstring was copy-pasted from a rule-generation routine
    and described confidence/lift, which this function never computes.)

    Param:
        data:       list of transactions (lists of integer item ids).
        minSupport: minimum support as a fraction of len(data).
    Returns:
        (SupportData, L) — SupportData maps each frequent itemset to its
        absolute support count; L[k-1] is the list of frequent k-itemsets.

    Relies on the module-level globals `keys`, `datamap` and `index_1`, and
    on the helpers createCandidateList / prune / transferStatus / scanDB.
    '''
    total_fre = 0
    minSupportCount = minSupport * len(data)
    print("最小支持度数量:", minSupportCount)
    # Level 1: frequent single items straight from the precomputed counts.
    F = []
    SupportData = {}
    for key in keys:
        if datamap[key] >= minSupportCount:
            F.append(frozenset([key]))
            SupportData[frozenset([key])] = datamap[key]
    print(F)
    total_fre += len(F)
    print("第1项集的个数:%d" % (len(F)))
    # Tid-lists of the level-1 frequent itemsets (shared with index_1).
    I = {}
    for item in F:
        t = list(item)[0]
        I[item] = index_1[t]
    # k is the size of the itemsets currently being generated.
    k = 2
    L = [F]  # L[k-1] holds the frequent k-itemsets
    # NOTE(review): `len(F) >= k` is a heuristic stop; classic Apriori loops
    # while F is non-empty — confirm this cutoff is intentional.
    while len(F) >= k:
        # Join step: build the k-item candidates.
        Ck = createCandidateList(F)
        print("第%d候选项集个数:%d" % (k, len(Ck)), end=' ')
        # Prune step: drop candidates with an infrequent (k-1)-subset.
        start = time.time()
        Ck = prune(Ck, F)
        end = time.time()
        print("第%d项集剪枝后的个数:%d" % (k, len(Ck)), end=' ')
        print(end - start, 's', end=" ")
        # Tid-list intersection for the surviving candidates.
        start = time.time()
        I = transferStatus(I, Ck)
        end = time.time()
        print("状态转移时间:", end - start, 's', end=' ')
        # Support counting directly from the tid-list lengths.
        start = time.time()
        F = scanDB(minSupportCount, Ck, I, SupportData)
        L.append(F)
        end = time.time()
        print("第%d项集个数:%d" % (k, len(F)), end=' ')
        print(end - start, 's')
        total_fre += len(F)
        k += 1
    print(SupportData)
    print("总频繁项集个数:", total_fre)
    return SupportData, L
# Run Apriori at 0.5% minimum support and report total wall-clock time.
t_begin = time.time()
SupportData, L = Apriori(data, 0.005)
print(SupportData)
t_end = time.time()
print("运算时间:", t_end - t_begin, "s", sep="")