# -*- coding: utf-8 -*-
# 获取频繁项集和关联规则的Python实现【先验算法】
# (Python implementation of frequent-itemset and association-rule mining: the Apriori algorithm)
# ---------------------------------------------------------------------------
# Parameters
# ---------------------------------------------------------------------------
# Input transactions: CSV-like text, one transaction per line,
# e.g.  item1,item2,item3
data_file = 'F:\\user_match_stat\\itemset.txt'

# Output destinations.
frequent_itemsets_save_file = 'F:\\user_match_stat\\frequent_itemsets.txt'
rules_readable_file_dest = 'F:\\user_match_stat\\rules_readable.txt'
rules_csv_file_dest = 'F:\\user_match_stat\\rules_csv.txt'
rules_ranked_desc_by_liftrate = 'F:\\user_match_stat\\rules_liftrate_desc.txt'
# Rule row format: itemset A,itemset B,support,confidence,liftrate
# Items inside one itemset are separated by '|'.

# Minimum support: every rule's support must be >= this value.
minsup = 0.01
# Minimum confidence.
minconf = 1e-6

# ---------------------------------------------------------------------------
# Statistics computed at run time
# ---------------------------------------------------------------------------
# transaction_cnt          -- total number of transactions read
# min_sup_cnt              -- minimum support expressed as a count
# transaction_cnt_distinct -- number of distinct transactions
transaction_cnt = min_sup_cnt = transaction_cnt_distinct = 0

# ---------------------------------------------------------------------------
# Global data structures
# ---------------------------------------------------------------------------
# {tranidx: number of occurrences of that distinct transaction}
# (filled by prepare_data).
transaction_cnt_dict = {}
# Originally annotated as dict(tuple, int).
frequent_itemsets_verified = {}
# Originally annotated as list(tuple, [set]); note it is initialised as a dict.
frequent_itemsets_candidate = {}
# {itemset length: {itemset tuple: support count}} -- all frequent itemsets.
frequent_itemsets = {}
# Holds all closed frequent itemsets.
closed_frequent_itemsets = {}
distinct_item_in_candidate_itemsets = set()
distinct_item_in_transaction_cnt_dict = set()
# {item: set of tranidx keys of transaction_cnt_dict containing that item}
item_transaction_list_dict = {}
# tranidx values of transactions that contain at least one frequent item.
hitted_transaction_set = set()
 
 
#获取事务集
def prepare_data():
    """Load the transaction file and build the global transaction index.

    Reads ``data_file`` line by line. Each non-blank line is split on ','
    into items; the items are sorted so that lines holding the same items in
    a different order count as the same transaction.

    Side effects on module globals:
      * ``transaction_cnt``            -- number of non-blank lines read
      * ``transaction_cnt_dict``       -- {tranidx: occurrences of that
                                          distinct transaction}, tranidx
                                          starting at 1
      * ``item_transaction_list_dict`` -- {item: set of tranidx containing it}
      * ``transaction_cnt_distinct``   -- number of distinct transactions
      * ``min_sup_cnt``                -- int(transaction_cnt * minsup)

    Progress messages are printed to stdout. Returns None.
    """
    # Only these three names are rebound; the containers are mutated in place
    # and therefore need no ``global`` declaration.
    global transaction_cnt
    global min_sup_cnt
    global transaction_cnt_distinct

    # Start clean: tranidx restarts at 1 on every call, so leftovers from a
    # previous call would otherwise point at the wrong transactions.
    transaction_cnt_dict.clear()
    item_transaction_list_dict.clear()

    pre_transaction_cnt_dict = {}   # {sorted item tuple: occurrence count}
    n = 0
    # ``with`` guarantees the file is closed even if reading raises.
    with open(data_file) as data_in:
        print('Reading data from ' + data_file + '...')
        for line in data_in:
            line = line.strip()     # drop the trailing newline
            if not line:
                continue

            n += 1
            tp = tuple(sorted(line.split(',')))
            pre_transaction_cnt_dict[tp] = pre_transaction_cnt_dict.get(tp, 0) + 1

    # Total number of transactions.
    transaction_cnt = n
    print('Totally read ' + str(n) + ' lines.')

    # Give every distinct transaction an index and record, per item, the
    # indices of the transactions that contain it.
    for tranidx, (tp, occurrences) in enumerate(pre_transaction_cnt_dict.items(), 1):
        transaction_cnt_dict[tranidx] = occurrences
        for item in tp:
            item_transaction_list_dict.setdefault(item, set()).add(tranidx)

    del pre_transaction_cnt_dict    # release the intermediate mapping early

    transaction_cnt_distinct = len(transaction_cnt_dict)
    # NOTE(review): int() truncates, so the effective threshold can sit
    # slightly below ``minsup`` when transaction_cnt * minsup is fractional.
    min_sup_cnt = int(transaction_cnt * minsup)
    print('The number of total transactions is ' + str(transaction_cnt) + '.')
    print('The number of distinct transactions is ' + str(transaction_cnt_distinct) + '.')
    print('The min support count is ' + str(min_sup_cnt) + '.')
    print('Function prepare_data done.')
    
    
#得到频繁一项集,直接从item_transaction_list_dict里面统计即可        
def get_frequent_itemsets_1():
    """Compute the frequent 1-itemsets from ``item_transaction_list_dict``.

    For every item, the support count is the sum of the occurrence counts
    (``transaction_cnt_dict[tranidx]``) of the distinct transactions that
    contain it. Counting the distinct transactions alone would be wrong,
    because each one may stand for several identical input lines.

    Side effects on module globals:
      * ``frequent_itemsets[1]`` is replaced by {(item,): support count} for
        every item whose count is >= ``min_sup_cnt``.
      * ``hitted_transaction_set`` is cleared and refilled with the tranidx
        of every transaction containing at least one such item.

    Prints a completion message to stdout. Returns None.
    """
    hitted_transaction_set.clear()
    level_1 = {}
    frequent_itemsets[1] = level_1

    for item, tranidx_set in item_transaction_list_dict.items():
        # Weight each distinct transaction by how often it occurred.
        cnt = sum(transaction_cnt_dict[tranidx] for tranidx in tranidx_set)

        if cnt >= min_sup_cnt:
            level_1[(item,)] = cnt
            # Remember which transactions hold at least one frequent item.
            hitted_transaction_set.update(tranidx_set)

    print('Function get_frequent_itemsets_1 done')
 
        
#获取候选项,根据frequent_itemsets_verified填充frequent_itemsets_candidate,清空frequent_itemsets_verified
#用Fk-1 * Fk-1法
#如果得不到新的K项,返回-1
def get_candida
  • 2
    点赞
  • 8
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值