《数据挖掘:概念与技术》习题选做
第六章习题
(1) 用 Python 简单实现 Apriori 算法
# -*- coding: utf-8 -*-
__author__ = "Yunfan Yang"
def gen_L1(TID):
    """Build the frequent 1-itemsets from a collection of transactions.

    The support count of an item is the number of transactions that contain
    it, so an item repeated inside a single transaction contributes only one
    to its count for that transaction.

    Args:
        TID: Sized collection of transactions (``len(TID)`` is used); each
            transaction is an iterable of hashable items, e.g.
            ``[["M", "O", "K"], ["O", "O", "K"]]``.

    Returns:
        A list of ``([item], support_count)`` tuples, ordered by each item's
        first appearance in ``TID``, containing only the items whose relative
        support ``support_count / len(TID)`` is at least ``min_sup``.
        An empty ``TID`` yields an empty list.

    Note:
        ``min_sup`` is not a parameter; it is read from module scope and is
        compared against a relative support (a fraction of ``len(TID)``).
    """
    # Support count per item, shaped like {"M": 3}.
    support_counts = {}
    for transaction in TID:
        # dict.fromkeys() drops duplicates while keeping first-seen order, so
        # a transaction such as ["O", "O", "K"] adds one to "O", not two.
        for item in dict.fromkeys(transaction):
            support_counts[item] = support_counts.get(item, 0) + 1

    # Candidate 1-itemsets: each item wrapped in a one-element list and paired
    # with its support count, e.g. (["M"], 3).
    C1 = [([item], count) for item, count in support_counts.items()]

    # Frequent 1-itemsets: candidates whose relative support reaches min_sup.
    # The condition is only evaluated when C1 is non-empty, so an empty TID
    # never divides by zero.
    L1 = [candidate for candidate in C1 if candidate[1] / len(TID) >= min_sup]
    return L1
def generateC_k(Lk_1):
"""产生候选k项集的函数"""
C_k = []
# 执行连接步骤
for i in range(len(Lk_1)-1): # 遍历Lk_1中的项集
# print(C)
for j in range(i+1, len(Lk_1)):
if len(Lk_1[i][0]) <= 1: # 频繁项集只有1项,则直接连接,产生候选项
C = Lk_1[i][0][:] # 复制Lk_1[i][0]给C
C.append(Lk_1[j][0][-1])
elif len(Lk_1[i][0]) > 1:
if if_equal(Lk_1[i][0][:-1], Lk_1[j][0][:-1]) and Lk_1[i][0][-1] != Lk_1[j][0][-1]: # 连接条件
C = Lk_1[i][0][:]