关联分析python代码

最新推荐文章于 2023-09-16 14:00:00 发布

故_kj

最新推荐文章于 2023-09-16 14:00:00 发布

阅读量1.5k

点赞数

文章标签： python 机器学习算法

本文链接：https://blog.csdn.net/qq_57494293/article/details/123209387

版权

import numpy as np
import pandas as pd
import itertools


def beautify(arr1, arr2):
    """Format one association rule as ``'{antecedent}-->{consequent}'``.

    Parameters
    ----------
    arr1 : array-like
        Items forming the rule's antecedent (left-hand side).
    arr2 : array-like
        Items of the whole itemset. The consequent is every item of
        ``arr2`` that does not occur in ``arr1``.

    Returns
    -------
    str
        Both item groups rendered through ``set``'s text form and joined
        by ``-->`` (an empty consequent therefore renders as ``set()``).
    """
    # asarray lets callers pass plain lists/tuples as well as ndarrays;
    # boolean-mask indexing below only works on an ndarray.
    antecedent = np.asarray(arr1)
    itemset = np.asarray(arr2)
    # Keep only the itemset members that are not already in the antecedent.
    consequent = itemset[~np.isin(itemset, antecedent)]
    # tolist() turns NumPy scalars into built-in Python values so the text
    # reads '{9}' instead of '{np.int64(9)}' under NumPy >= 2.
    front = set(antecedent.tolist())
    back = set(consequent.tolist())
    return '{}-->{}'.format(front, back)


def get_sublist(item):
    """Return every non-empty combination of the elements of ``item``.

    The result is a list of tuples, ordered by size first (1, 2, ...,
    ``len(item)``) and, within one size, in the order produced by
    ``itertools.combinations``. An empty ``item`` gives an empty list.
    """
    sizes = range(1, len(item) + 1)
    per_size = (itertools.combinations(item, size) for size in sizes)
    return list(itertools.chain.from_iterable(per_size))


def _rules_from_itemset(col, itemset, itemset_support, confidence):
    """Return the formatted rules of one frequent itemset whose confidence exceeds ``confidence``."""
    itemset_arr = np.array(itemset)
    rules = []
    # Antecedents already known to be too weak FOR THIS ITEMSET. The set is
    # deliberately local: confidence is sup(itemset) / sup(antecedent), so an
    # antecedent that is weak for one itemset may be strong for another.
    weak = set()
    # Larger antecedents first, so a weak antecedent can rule out its subsets.
    for size in range(len(itemset) - 1, 0, -1):
        for antecedent in itertools.combinations(itemset, size):
            if antecedent in weak:
                continue
            antecedent_support = col[list(antecedent)].all(axis=1).mean()
            if itemset_support / antecedent_support > confidence:
                rules.append(beautify(np.array(antecedent), itemset_arr))
            else:
                # A subset of the antecedent has support >= the antecedent's,
                # hence confidence <= this one: skip all of them.
                weak.update(get_sublist(antecedent))
    return rules


def association_analysis(df, support=0.4, confidence=0.5):
    """
    Mine association rules from transaction data with the Apriori algorithm.

    Parameters
    ----------
    df : DataFrame(bool)
        One row per transaction and one column per item; a cell is True
        when the transaction contains that item and False otherwise.
    support : float = 0.4
        An itemset is frequent when the fraction of rows containing all of
        its items is strictly greater than this value.
    confidence : float = 0.5
        A rule is kept when sup(itemset) / sup(antecedent) is strictly
        greater than this value.

    Returns
    -------
    rule : list
        One formatted string per association rule (see ``beautify``).
    """
    # Frequent 1-itemsets: columns whose share of True rows exceeds `support`.
    col = df.loc[:, (df.mean(axis=0) > support)]
    label = col.columns.to_numpy()
    width = col.shape[1]
    rule = []
    # Frequent itemsets of the previous level, stored as label tuples.
    # itertools.combinations keeps the column order, so a sub-combination of
    # a candidate is spelled exactly like the tuple stored one level earlier.
    frequent_prev = {(item,) for item in label}
    n = 2
    while n <= width:
        frequent_now = set()
        for candidate in itertools.combinations(label, n):
            # Apriori pruning: every (n-1)-subset of a frequent itemset must
            # itself be frequent, otherwise the candidate cannot be.
            if any(part not in frequent_prev
                   for part in itertools.combinations(candidate, n - 1)):
                continue
            col_sup = col[list(candidate)].all(axis=1).mean()
            if col_sup > support:
                frequent_now.add(candidate)
                rule.extend(_rules_from_itemset(col, candidate, col_sup, confidence))
        if not frequent_now:
            # No frequent n-itemset means no larger itemset can be frequent.
            break
        frequent_prev = frequent_now
        n += 1
    return rule


def data_set(n, p, seed=20220227):
    """Build a reproducible random boolean table for experimenting.

    Seeds NumPy's global random state with ``seed``, draws an ``n`` x ``p``
    array of ``False``/``True`` values and returns it as a DataFrame whose
    columns are labelled ``0 .. p-1``.
    """
    np.random.seed(seed)
    shape = (n, p)
    draws = np.random.choice([False, True], size=shape)
    frame = pd.DataFrame(draws, columns=range(p))
    return frame


if __name__ == '__main__':
    # Demo: mine rules from a random 70-transaction x 20-item table using
    # the default support and confidence thresholds.
    sample = data_set(70, 20)
    rules = association_analysis(sample)
    print(rules)
    # Output reported by the author for this seed:
    # ['{9}-->{18}', '{18}-->{9}', '{13}-->{18}', '{18}-->{13}']

故_kj

关注

0
点赞
踩
7

收藏

觉得还不错? 一键收藏
0
评论
关联分析python代码

import numpy as np import pandas as pd import itertools def beautify(arr1, arr2): # 美化关联规则的输出形式（不重要） arr_bool = ~np.isin(arr2, arr1) front = set(arr1) back = set(arr2[arr_bool]) return '{}-->{}'.format(front, back) def get_sublist(...
复制链接

扫一扫