这周学习了数据挖掘中用于计算频繁项集的 Apriori 算法，
老师让用 Python 实现一下，于是自己按照《数据挖掘：概念与技术》第六章讲解的过程实现了一遍。
如果有大佬发现问题，欢迎提出意见。
实现如下
"""
Apriori 算法
@author: liuyinxin
"""
# Support threshold, set to 2.
support = 2
def load_data(path='test_data.txt'):
    """
    Load the transaction database from a whitespace-separated text file.

    Every non-blank line of the file is one transaction; its
    whitespace-separated tokens are converted to ints.  Blank lines are
    skipped so that a stray empty line (for example at the end of the file)
    does not turn into an empty transaction.

    :param path: file to read; defaults to 'test_data.txt', the file the
        original implementation always opened
    :return: list of transactions, each a list of ints
    :raises ValueError: propagated from int() when a token is not an integer
    """
    data = []
    with open(path, 'r') as f:
        # Iterating the file object yields one line at a time until EOF.
        for line in f:
            items = [int(token) for token in line.split()]
            if items:
                data.append(items)
    return data
def freq_1_items(data, sup=2):
    """
    Find the frequent 1-itemsets.

    The support of an item is the number of transactions that contain it,
    so an item repeated inside a single transaction is counted only once
    for that transaction.

    :param data: iterable of transactions, each an iterable of hashable,
        mutually comparable items
    :param sup: minimum number of transactions an item must appear in
    :return: sorted list of one-element lists, one per frequent item
    """
    counts = {}
    for transaction in data:
        # set() removes duplicates so each transaction contributes at most 1.
        for item in set(transaction):
            counts[item] = counts.get(item, 0) + 1
    return sorted([item] for item, count in counts.items() if count >= sup)
def check_join(a, b, k):
    """
    Decide whether two itemsets may be joined into a larger candidate.

    The join is allowed when the first k-1 items of ``a`` and ``b`` are
    equal and the k-th item of ``a`` is smaller than the k-th item of ``b``.

    :param a: first itemset, indexable
    :param b: second itemset, indexable
    :param k: number of leading positions to inspect
    :return: False on the first differing prefix position, otherwise the
        result of comparing the k-th items with ``<``
    """
    last = k - 1
    # Positions 0 .. k-2 form the shared prefix.
    if any(a[pos] != b[pos] for pos in range(last)):
        return False
    return a[last] < b[last]
def iscut(L, c):
    """
    Pruning test: report whether candidate ``c`` must be discarded.

    A candidate k-itemset can only be frequent if every one of its
    (k-1)-item subsets is frequent.  The candidate is therefore cut as soon
    as one such subset is missing from ``L``.

    :param L: iterable of frequent (k-1)-itemsets, each an iterable of
        hashable items (sets or lists both work)
    :param c: candidate itemset, an iterable of hashable items
    :return: True if ``c`` has a (k-1)-subset that is not in ``L`` and must
        be pruned, False if ``c`` should be kept
    """
    candidate = frozenset(c)
    # The only proper subset of a single item is the empty set, which is
    # trivially frequent, so there is nothing to prune on.
    if len(candidate) <= 1:
        return False
    # frozensets are hashable, giving order-independent O(1) membership.
    frequent = {frozenset(itemset) for itemset in L}
    # Dropping one item at a time enumerates every (k-1)-subset of c.
    return any(candidate - {item} not in frequent for item in candidate)
def aprioir_gen(L, k):
    """
    Generate candidate itemsets from the itemsets in ``L``.

    Every pair of itemsets (in list order) that passes ``check_join`` is
    merged by appending the last item of the second itemset to a copy of
    the first.  Merged candidates for which ``iscut`` returns a true value
    are dropped.

    :param L: list of itemsets, each a list of items
    :param k: value forwarded to ``check_join`` for every pair
    :return: list of candidate itemsets (lists) in generation order
    """
    as_sets = [set(itemset) for itemset in L]
    candidates = []
    for pos, first in enumerate(L):
        for second in L[pos + 1:]:
            if not check_join(first, second, k):
                continue
            joined = list(first) + [second[-1]]
            if iscut(as_sets, joined):
                continue
            candidates.append(joined)
    return candidates
def aprioir(data, min_sup):
    """
    Run the level-wise Apriori search over ``data``.

    Starting from the result of ``freq_1_items``, each round asks
    ``aprioir_gen`` for candidates, counts how many transactions contain
    each candidate, and keeps those whose count reaches ``min_sup``.  The
    loop stops once a round yields no itemsets; that final empty list is
    part of the returned result.

    :param data: iterable of transactions, each an iterable of items
    :param min_sup: minimum number of containing transactions
    :return: list of levels; entry i holds the sorted itemsets found in
        round i + 1, and the last entry is always an empty list
    """
    levels = [freq_1_items(data, min_sup)]
    transactions = [set(row) for row in data]
    size = 1
    while levels[-1]:
        candidates = aprioir_gen(levels[-1], size)
        candidate_sets = [set(cand) for cand in candidates]
        # Map candidate index -> number of transactions containing it;
        # candidates that never occur simply get no entry.
        hits = {}
        for transaction in transactions:
            for idx, cand_set in enumerate(candidate_sets):
                if cand_set <= transaction:
                    hits[idx] = hits.get(idx, 0) + 1
        survivors = sorted(
            candidates[idx] for idx, total in hits.items() if total >= min_sup
        )
        levels.append(survivors)
        size += 1
    return levels
# Mine the frequent itemsets of the data file, using the module-level
# `support` threshold rather than a second hard-coded copy of its value.
L = aprioir(load_data(), support)
# Entry i (0-based) of the result is printed as level i + 1.
for i, l in enumerate(L):
    print('i = ', i + 1, ' L=', l)
输出结果如下，
第 i 行就是频繁 i 项集的集合（最后一行为空，表示不存在更大的频繁项集）：
i = 1 L= [[1], [2], [3], [4], [5]]
i = 2 L= [[1, 2], [1, 3], [1, 5], [2, 3], [2, 4], [2, 5]]
i = 3 L= [[1, 2, 3], [1, 2, 5]]
i = 4 L= []