Apriori算法的python实现

1.数据格式

2.代码 

(1)自编写:速度更快(大规模数据集推荐)

import pandas as pd
from collections import Counter

def apriori(D, minSup):
    """Mine all frequent itemsets of the transaction database ``D``.

    Parameters
    ----------
    D : list
        Transactions; each transaction is an iterable of hashable items.
    minSup : float
        Minimum support threshold, as a fraction of ``len(D)``.

    Returns
    -------
    tuple
        ``(itemsets, supports)``: the frequent itemsets (each a sorted
        list of items) and a dict mapping ``tuple(itemset)`` to its
        support (fraction of transactions containing it).

    Notes
    -----
    Fixes two defects of the naive version: (1) support counts were
    zipped against the *filtered* candidate list, attaching wrong counts
    to the surviving itemsets; (2) candidate unions of arbitrary size
    allowed the same itemset to be generated and counted at several
    levels, duplicating it in the output.
    """
    n = len(D)
    if n == 0:  # avoid ZeroDivisionError on an empty database
        return [], {}

    # Convert once; set containment inside the count loop is then O(|key|).
    transactions = [set(t) for t in D]

    def _support_count(key):
        # Number of transactions containing every item of `key`.
        key = set(key)
        return sum(1 for t in transactions if key <= t)

    def _gen_candidates(frequent):
        # Standard Apriori join: unions of two frequent k-itemsets that
        # have exactly k + 1 items, each emitted once.
        k = len(frequent[0]) if frequent else 0
        seen, out = set(), []
        for i in range(len(frequent)):
            for j in range(i + 1, len(frequent)):
                union = sorted(set(frequent[i]) | set(frequent[j]))
                if len(union) == k + 1 and tuple(union) not in seen:
                    seen.add(tuple(union))
                    out.append(union)
        return out

    # Frequent 1-itemsets from a single counting pass.
    C1 = Counter(item for t in D for item in t)
    keys = [[item] for item in C1 if C1[item] / n >= minSup]

    all_keys = []
    support_counts = {}
    while keys:
        # Keep each candidate paired with its own count so filtering
        # cannot misalign them.
        counted = [(key, _support_count(key)) for key in keys]
        cut = [(key, c) for key, c in counted if c / n >= minSup]
        for key, c in cut:
            support_counts[tuple(key)] = c / n
        all_keys.extend(key for key, _ in cut)
        keys = _gen_candidates([key for key, _ in cut])
    return all_keys, support_counts

def aproiri_gen(keys1):
    """Generate the next round of candidate itemsets.

    Emits the sorted union of every pair of distinct (by value) itemsets
    in ``keys1``, once each, skipping any union that already occurs in
    ``keys1`` itself.  Note: unions may be larger than ``k + 1`` items;
    the caller's support filter is what prunes them.
    """
    candidates = []
    for first in keys1:
        for second in keys1:
            if first == second:
                continue  # value-based skip, as in pairing an itemset with itself
            merged = sorted(set(first) | set(second))
            if merged in keys1 or merged in candidates:
                continue  # already a known itemset or already emitted
            candidates.append(merged)
    return candidates

def Dataset_generation(file):
    """Read an access-log CSV and turn it into a list of transactions.

    Reads ``file`` (GBK-encoded; expected to contain a '用户IP' and a
    'URL' column), assigns every distinct URL an integer id — published
    through the module-level ``mapping`` dict (url -> id) — and returns
    one transaction per distinct user IP: that IP's URL ids as strings,
    in row order.
    """
    global mapping  # consumed later to translate ids back to URLs
    df = pd.read_csv(file, encoding='gbk')
    print(df)
    print(df.nunique())
    # Unique rows by the first column (assumed to be the user-IP column
    # — TODO confirm against the input schema).
    df_deduped = df.drop_duplicates(subset=[df.columns[0]])
    print(df_deduped)
    print(df_deduped.nunique())

    mapping = {val: idx for idx, val in enumerate(df['URL'].unique())}
    df['URL'] = df['URL'].map(mapping).astype(str)

    # Group once instead of re-filtering the whole frame for every IP
    # (the original per-IP boolean mask was O(rows * ips)); sort=False
    # keeps groups in first-appearance order.
    urls_by_ip = df.groupby('用户IP', sort=False)['URL'].apply(list)
    # dict.fromkeys preserves the deduped IPs' first-occurrence order,
    # matching the original dict-insertion semantics.
    return [urls_by_ip[ip] for ip in dict.fromkeys(df_deduped['用户IP'])]

new_list = Dataset_generation('acc_records(1).csv')
F, support_counts = apriori(new_list, 1e-2)
print("Frequent Itemsets:")
# Invert the url->id mapping ONCE; the original rebuilt and linearly
# scanned mapping.values() for every single item (O(|mapping|) each).
id_to_url = {idx: url for url, idx in mapping.items()}
for itemset in F:
    mapped_itemset = [id_to_url[int(item)] for item in itemset]
    print(mapped_itemset, "Support:", support_counts[tuple(itemset)])

(2)调用库:结构更简洁 

from mlxtend.frequent_patterns import apriori
import pandas as pd

def Dataset_generation(file):
    """Build the transaction list for mlxtend from an access-log CSV.

    Reads ``file`` (GBK-encoded; expected to contain a '用户IP' and a
    'URL' column), replaces each URL with an integer id kept in the
    module-level ``mapping`` dict (url -> id), and returns, per distinct
    user IP, the list of that IP's URL ids as strings in row order.
    """
    global mapping  # used afterwards to map ids back to readable URLs
    df = pd.read_csv(file, encoding='gbk')
    print(df)
    print(df.nunique())
    # Deduplicate on the first column (assumed to be 用户IP — TODO confirm).
    df_deduped = df.drop_duplicates(subset=[df.columns[0]])
    print(df_deduped)
    print(df_deduped.nunique())

    mapping = {val: idx for idx, val in enumerate(df['URL'].unique())}
    df['URL'] = df['URL'].map(mapping).astype(str)

    # Single groupby pass replaces the per-IP full-frame mask filter,
    # which was O(rows * ips); sort=False preserves first-appearance order.
    urls_by_ip = df.groupby('用户IP', sort=False)['URL'].apply(list)
    return [urls_by_ip[ip] for ip in dict.fromkeys(df_deduped['用户IP'])]

new_list = Dataset_generation('acc_records(1).csv')
df = pd.DataFrame(new_list)
# One-hot encode the ragged transaction lists into a boolean item matrix,
# the input shape mlxtend's apriori expects.
df = df.stack().str.get_dummies().groupby(level=0).sum().astype(bool)
frequent_itemsets = apriori(df, min_support=1e-2, use_colnames=True)

# Invert the url->id mapping ONCE; the original rebuilt and linearly
# scanned mapping.values() for every item of every itemset.
id_to_url = {idx: url for url, idx in mapping.items()}
frequent_itemsets['itemsets'] = frequent_itemsets['itemsets'].apply(
    lambda itemset: [id_to_url[int(item)] for item in itemset])
print(frequent_itemsets)

3.后续更新中

  • 4
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值