用Python实现Apriori算法和FP-growth算法

吾研

已于 2022-09-22 14:58:26 修改

阅读量802

点赞数 7

文章标签： python 算法 数据挖掘

于 2022-09-22 12:00:07 首次发布

原文链接：http://www.tybai.com/ml/FP-tree%E6%8E%A8%E8%8D%90%E7%AE%97%E6%B3%95.html；https://article.itxueyuan.com/jA91L

版权

Apriori算法代码：

def load_data_set():
    """
    Load the sample data set.

    Returns:
        A list of transactions. Each transaction is a list of items.
        A new list is built on every call.
    """
    transactions = [
        ['A', 'C', 'S', 'l'],
        ['D', 'A', 'C', 'E', 'B'],
        ['A', 'B', 'C'],
        ['C', 'A', 'B', 'E'],
    ]
    return transactions


def create_C1(data_set):
    """
    Create C1, the set of candidate 1-itemsets, by scanning the data set.

    Args:
        data_set: A list of transactions. Each transaction contains
                  several items.
    Returns:
        C1: A set with one single-element frozenset per distinct item
            found in data_set.
    """
    return {
        frozenset([item])
        for transaction in data_set
        for item in transaction
    }


def is_apriori(Ck_item, Lksub1):
    """
    Check whether a candidate k-itemset satisfies the Apriori property.

    Args:
        Ck_item: A candidate k-itemset (a frozenset) taken from Ck.
        Lksub1: Lk-1, a set containing the frequent (k-1)-itemsets.
    Returns:
        True if every subset obtained by removing exactly one item from
        Ck_item is a member of Lksub1, False otherwise. An empty Ck_item
        yields True.
    """
    return all(
        (Ck_item - frozenset([item])) in Lksub1
        for item in Ck_item
    )


def create_Ck(Lksub1, k):
    """
    Create Ck, the set of candidate k-itemsets, by joining Lk-1 with itself.

    Two (k-1)-itemsets are joined when their first k-2 items (in sorted
    order) are equal. The union is kept only if it passes the pruning
    check done by is_apriori.

    Args:
        Lksub1: Lk-1, a set containing the frequent (k-1)-itemsets.
        k: The number of items in the itemsets to generate.
    Returns:
        Ck: A set containing the candidate k-itemsets.
    """
    Ck = set()
    itemsets = list(Lksub1)
    # Sort every itemset once up front instead of re-sorting both operands
    # on each iteration of the join loop.
    prefixes = [sorted(itemset)[0:k - 2] for itemset in itemsets]
    count = len(itemsets)
    for i in range(count):
        # Start at i + 1 so each unordered pair is joined exactly once and
        # an itemset is never joined with itself.
        for j in range(i + 1, count):
            if prefixes[i] == prefixes[j]:
                Ck_item = itemsets[i] | itemsets[j]
                # pruning
                if is_apriori(Ck_item, Lksub1):
                    Ck.add(Ck_item)
    return Ck


def generate_Lk_by_Ck(data_set, Ck, min_support, support_data):
    """
    Generate Lk by removing from Ck the candidates with too little support.

    Args:
        data_set: A list of transactions. Each transaction contains
                  several items.
        Ck: A set containing the candidate k-itemsets.
        min_support: The minimum support, as a fraction of the number of
                     transactions.
        support_data: A dictionary that is updated in place. Each frequent
                      itemset found here is stored as a key with its
                      support as the value.
    Returns:
        Lk: A set containing the frequent k-itemsets.
    """
    occurrences = {}
    for transaction in data_set:
        for candidate in Ck:
            if not candidate.issubset(transaction):
                continue
            occurrences[candidate] = occurrences.get(candidate, 0) + 1

    total = float(len(data_set))
    frequent = set()
    for candidate, hits in occurrences.items():
        ratio = hits / total
        if ratio >= min_support:
            frequent.add(candidate)
            support_data[candidate] = ratio
    return frequent


def generate_L(data_set, k, min_support):
    """
    Generate the frequent itemsets of every size from 1 up to k.

    Args:
        data_set: A list of transactions. Each transaction contains
                  several items.
        k: The maximum number of items in a frequent itemset.
        min_support: The minimum support.
    Returns:
        L: A list of sets; entry i holds the frequent (i+1)-itemsets.
           An entry may be an empty set.
        support_data: A dictionary mapping each frequent itemset to its
                      support.
    """
    support_data = {}
    first_level = generate_Lk_by_Ck(
        data_set, create_C1(data_set), min_support, support_data)
    levels = [first_level.copy()]
    for size in range(2, k + 1):
        # Candidates of this size are built from the previous level only.
        candidates = create_Ck(levels[-1], size)
        frequent = generate_Lk_by_Ck(
            data_set, candidates, min_support, support_data)
        levels.append(frequent.copy())
    return levels, support_data


def generate_big_rules(L, support_data, min_conf):
    """
    Generate association rules ("big rules") from the frequent itemsets.

    Args:
        L: A list of Lk sets, as returned by generate_L.
        support_data: A dictionary mapping each frequent itemset to its
                      support.
        min_conf: The minimum confidence a rule must reach to be kept.
    Returns:
        big_rule_list: A list of rules without duplicates. Each rule is a
                       3-tuple (antecedent, consequent, confidence).
    """
    rules = []
    # Every itemset visited so far; each one is tried as the consequent of
    # the itemsets visited after it.
    visited = []
    for level in L:
        for freq_set in level:
            for consequent in visited:
                if not consequent.issubset(freq_set):
                    continue
                antecedent = freq_set - consequent
                conf = support_data[freq_set] / support_data[antecedent]
                rule = (antecedent, consequent, conf)
                if conf >= min_conf and rule not in rules:
                    rules.append(rule)
            visited.append(freq_set)
    return rules


if __name__ == "__main__":
    # Demo: mine the frequent itemsets and the association rules of the
    # sample data set and print them.
    data_set = load_data_set()
    L, support_data = generate_L(data_set, k=3, min_support=0.6)  # minsup = 60%
    big_rules_list = generate_big_rules(L, support_data, min_conf=0.8)  # minconf = 80%
    for Lk in L:
        # A level is empty when no itemset of that size is frequent; skip it
        # instead of reading the size from a non-existent first element.
        if not Lk:
            continue
        print("=" * 50)
        print("frequent " + str(len(next(iter(Lk)))) + "-itemsets\t\tsupport")
        print("=" * 50)
        for freq_set in Lk:
            print(freq_set, support_data[freq_set])
    # Blank separator line; a bare `print` does nothing in Python 3.
    print()
    print("Big Rules")
    for item in big_rules_list:
        print(item[0], "=>", item[1], "conf: ", item[2])

效果图：

FP-growth（FP-tree）算法代码：

# !/usr/bin/python3.4
# -*- coding: utf-8 -*-

from collections import Counter


# 遍历数据，进行计数
def countitem(array):
    """Count how often each item occurs across all transactions.

    Args:
        array: An iterable of transactions; each transaction is an
            iterable of hashable items.

    Returns:
        A plain dictionary mapping each item to its number of occurrences,
        e.g. {'G': 2, 'B': 7, 'D': 6, 'A': 3, 'E': 4, 'C': 8, 'F': 1}.
    """
    # Count in a single pass instead of rebuilding a Counter over the whole
    # data set for every distinct key.
    counts = Counter(value for item in array for value in item)
    # Hand back a plain dictionary rather than the Counter itself, so
    # callers keep receiving the same type as before.
    return {key: count for key, count in counts.items()}


# 删除支持度不够的key
def deletekey(dict, support):
    """Split the item counts into kept entries and dropped keys.

    Args:
        dict: A dictionary mapping each item to its count.
        support: The minimum count; items with a smaller count are dropped.

    Returns:
        A tuple (kept, removed). kept is a copy of the input without the
        dropped items; removed lists the dropped keys in input order,
        e.g. ({'A': 3, 'B': 7, 'E': 4, 'D': 6, 'C': 8}, ['F', 'G']).
        The input dictionary is not modified.
    """
    removed = [key for key, count in dict.items() if count < support]
    kept = dict.copy()
    for key in removed:
        del kept[key]
    return kept, removed


# 得到从大到小排序的数组
def sorfarray(array, dict, delect):
    """Drop infrequent items and order each transaction by descending count.

    Args:
        array: A list of transactions; each transaction is an iterable of
            items.
        dict: A dictionary mapping every kept item to its count.
        delect: A collection of items to leave out.

    Returns:
        A new list with one list per transaction. Each inner list holds the
        transaction's kept items without repeats, highest count first, e.g.
        [['C', 'B', 'E'], ['C', 'D'], ['C', 'B', 'A'], ['B', 'D'], ...].

    Raises:
        KeyError: If an item is neither in delect nor in dict.
    """
    # Build one global ranking shared by every transaction. sorted() is
    # stable, so items with equal counts keep the order they have in `dict`.
    # Ranking inside each transaction instead lets ties fall in transaction
    # order, so the same items can come out as ['A', 'C'] in one transaction
    # and ['C', 'A'] in another.
    ranking = sorted(dict, key=dict.get, reverse=True)
    rank = {key: position for position, key in enumerate(ranking)}

    newarray = []
    for item in array:
        # Dictionary keys also remove items repeated within a transaction.
        kept = {value: rank[value] for value in item if value not in delect}
        newarray.append(sorted(kept, key=kept.get))
    return newarray


# info里面元素的种类
def getkinds(array):
    """Return the distinct items found in the transactions, sorted ascending.

    Args:
        array: A list of transactions; each transaction is an iterable of
            items.

    Returns:
        A sorted list of the distinct items, e.g. ['A', 'B', 'C', 'D', 'E'].
    """
    distinct = []
    for value in (value for item in array for value in item):
        if value not in distinct:
            distinct.append(value)
    distinct.sort()
    return distinct


# 得到每一个种类的所有路径
def getrootpath(kinds, newinfo, dict):
    """Collect, for every kind, the items that come before it in each transaction.

    Args:
        kinds: The items to collect paths for.
        newinfo: A list of transactions, each a list of items.
        dict: Not used by this function.

    Returns:
        A dictionary mapping each kind to a list of paths. A path is the
        list of items found before the first occurrence of the kind in a
        transaction containing it; empty paths are left out. Example:
        {'B': [['C'], ['C']], 'C': [], ...}.
    """
    def items_before(row, kind):
        # Items of row up to, but not including, the first match of kind.
        leading = []
        for value in row:
            if kind == value:
                break
            leading.append(value)
        return leading

    allinfo = {}
    for kind in kinds:
        paths = (items_before(row, kind) for row in newinfo if kind in row)
        allinfo[kind] = [path for path in paths if path]
    return allinfo


# 得到所有组合的字典
def getrange(rootpath):
    """Count single items and whole multi-item paths for every kind.

    Args:
        rootpath: A dictionary mapping each kind to its list of paths; each
            path is a list of item strings.

    Returns:
        A dictionary mapping each kind to a dictionary of counts. That inner
        dictionary has one entry per item (how many times it appears in the
        kind's paths) and one entry per path longer than one item, keyed by
        the items joined into one string (how many identical paths exist).
        Example: {'D': {'C': 5, 'B': 4, 'CB': 3}, 'B': {'C': 5}, 'C': {}}.
    """
    alldict = {}
    for key, root in rootpath.items():
        # Single items: count in one pass instead of calling list.count()
        # after every append.
        item_counts = Counter(value for path in root for value in path)
        combos = {value: count for value, count in item_counts.items()}

        # Whole paths: count identical paths once instead of comparing every
        # path with every other path.
        path_counts = Counter(tuple(path) for path in root if len(path) > 1)
        # Walk root rather than path_counts so that, when two different
        # paths join to the same string, the later one still wins.
        for path in root:
            if len(path) > 1:
                combos["".join(path)] = path_counts[tuple(path)]

        alldict[key] = combos
    return alldict


# 得到每个种类的置信度
def confidence(alldict, support, newinfo):
    """Turn the combination counts into "count/denominator" ratio strings.

    Args:
        alldict: A dictionary mapping each kind to a dictionary of
            combination counts, as built by getrange. It is not modified.
        support: The minimum count; combinations with a smaller count are
            dropped.
        newinfo: A list of transactions, each a list of items.

    Returns:
        A dictionary mapping each kind to {combination: "count/denominator"}.
        The denominator is the number of transactions that contain every
        character of the combination key. Kinds with no remaining
        combination are left out. Example:
        {'E': {'B': '3/7', 'C': '3/8'}, 'D': {'CB': '3/5'}} reads as: of the
        transactions with C, 3 in 8 lead to E; of those with both C and B,
        3 in 5 lead to D.

    Note:
        Each character of a key is treated as one item, so the denominator
        is only meaningful when item names are single characters.
    """
    newdict = {}
    for kind, combos in alldict.items():
        ratios = {}
        for key, count in combos.items():
            if count < support:
                continue
            # Works for a combination of any length; the previous
            # if/elif chain stopped at three items and left the
            # denominator at 0 for anything longer.
            denominator = sum(
                1 for item in newinfo
                if key and all(part in item for part in key)
            )
            ratios[key] = str(count) + "/" + str(denominator)
        if ratios:
            newdict[kind] = ratios
    return newdict


if __name__ == '__main__':
    # Demo run on a small sample. Everything, including the prints, stays
    # inside this guard so that importing the module neither prints nor
    # fails on names that only exist when run as a script.
    support = 3
    info = [["A", "C", "S", "L"], ["D", "A", "C", "E", "B"], ["A", "B", "C"], ["C", "A", "B", "E"]]

    # Count every item. The result is not stored under the name `dict`,
    # which would hide the builtin for the rest of the module.
    item_counts = countitem(info)
    # Drop the items whose count is below the support threshold.
    item_counts, delete = deletekey(item_counts, support)
    # Order every transaction from the highest count to the lowest.
    newinfo = sorfarray(info, item_counts, delete)
    # The distinct items left in the transactions.
    kinds = getkinds(newinfo)
    # The leading paths of every item.
    rootpath = getrootpath(kinds, newinfo, item_counts)
    # The counts of every combination.
    alldict = getrange(rootpath)
    # The ratio strings of every item, computed once and reused below.
    ratios = confidence(alldict, support, newinfo)

    separator = "--------------------------------------------------------------------------------"
    print(item_counts)
    print(separator)
    print(alldict)
    print(separator)
    print(ratios)
    print(separator)

效果图：