apriori关联规则算法
1、apriori关联规则原理
从频繁项集生成关联规则的函数
2、apriori关联规则算法实现
第一步、导入原始数据,统计出现的数据种类
# 导入包
import mlxtend
import numpy as np
import pandas as pd
from mlxtend.frequent_patterns import apriori, fpmax, fpgrowth, association_rules
# Load the raw data and count how often each distinct value occurs.
from collections import Counter

# header=None treats the first line as data; dtype=object reads every column
# as generic Python objects instead of letting pandas infer numeric types.
data = pd.read_csv('./apriori.txt', header=None, dtype=object)
a, b = data.shape  # (rows, columns); author's note: 930, 7

# Count the occurrences of every value found in the table.
df = data
# Flatten the table row by row in a single pass instead of doing one
# label-based df.loc lookup per cell. Row-major order matches the nested
# row/column traversal, so the Counter keys keep the same first-seen order.
all_kind = df.to_numpy().ravel().tolist()
result = Counter(all_kind)
print('各数据出现的次数:\n--------------------\n', pd.DataFrame([list(result.keys()), list(result.values())],
                                                  index=['数据', '次数']))  # author's note: 6510
第二步、将原始数据转换为布尔型
# One-hot encode the table: one boolean column per distinct value, True where
# the row contains that value.
# Missing cells (NaN) are not items, so they get no column of their own.
kinds = [kind for kind in result if pd.notna(kind)]
column_of = {kind: pos for pos, kind in enumerate(kinds)}

# Fill a bool array directly rather than writing True into a float matrix,
# which relies on implicit dtype upcasting that recent pandas releases
# deprecate and can leave object-dtype columns behind.
cells = df.to_numpy()
onehot = np.zeros((a, len(kinds)), dtype=bool)
for row_pos in range(a):
    for item in cells[row_pos]:
        if pd.notna(item):
            onehot[row_pos, column_of[item]] = True
data_ = pd.DataFrame(onehot, columns=kinds)
df_ = data_.astype(bool)  # boolean frame handed to apriori further down
print('转为布尔类型后的数据:\n-------------------\n', df_)

# Sanity check (optional): tally the True/False cells of the encoded matrix.
num_ = df_.to_numpy().ravel().tolist()
print(Counter(num_))
第三步、计算频繁项集
# Mine the frequent itemsets from the boolean matrix.
# Minimum support is 0.06; use_colnames=True reports itemsets by column
# label instead of column position.
frequent_itermsets = apriori(
    df_,
    min_support=0.06,
    use_colnames=True,
)
print('频繁项集:\n-----------------------\n', frequent_itermsets)
第四步、根据置信度筛选关联规则
# Derive association rules from the frequent itemsets, keeping only rules
# whose confidence reaches the 0.75 minimum.
result = association_rules(
    frequent_itermsets,
    metric='confidence',
    min_threshold=0.75,
)
# Show only the columns of interest for each surviving rule.
print(
    '筛选之后的关联规则:\n-----------------------\n',
    result.loc[:, ['antecedents', 'consequents', 'support', 'confidence']],
)
第五步、输出结果、得出结论
筛选之后的关联规则:
-----------------------
antecedents consequents support confidence
0 (F4, C3) (H4) 0.075269 0.875000
1 (F4, B2) (H4) 0.062366 0.794521
2 (A3, F4) (H4) 0.078495 0.879518
3 (E3, C2) (D2) 0.092473 0.754386
4 (H4, D2, F3) (A2) 0.062366 0.753247