# -*- coding: utf-8 -*-
# Parameter settings
data_file = 'F:\\user_match_stat\\itemset.txt'
# Input file format: CSV, e.g. item1,item2,item3
# One transaction per line
frequent_itemsets_save_file = 'F:\\user_match_stat\\frequent_itemsets.txt'
rules_readable_file_dest = 'F:\\user_match_stat\\rules_readable.txt'
rules_csv_file_dest = 'F:\\user_match_stat\\rules_csv.txt'
rules_ranked_desc_by_liftrate = 'F:\\user_match_stat\\rules_liftrate_desc.txt'
# Format: itemset A,itemset B,support,confidence,liftrate
# Items inside an itemset are separated by |
minsup = 0.01 # minimum support; every rule's support must be >= this value
minconf = 0.000001 # minimum confidence
# Statistics derived by computation
transaction_cnt = 0 # total number of transactions
min_sup_cnt = 0 # minimum support count
transaction_cnt_distinct = 0 # number of distinct transactions
# Global data structures
transaction_cnt_dict = {} # dict(tranidx int -> occurrence count); prepare_data keys it by index, not by tuple
frequent_itemsets_verified = {} # dict(tuple , int)
frequent_itemsets_candidate = {} # originally annotated list(tuple , [set]) although initialised as a dict; usage not visible here - TODO confirm
frequent_itemsets = {} # dict( int , dict(tuple , int) ): all frequent itemsets; the outer int key is the itemset length
closed_frequent_itemsets = {} # holds all closed frequent itemsets
distinct_item_in_candidate_itemsets = set()
distinct_item_in_transaction_cnt_dict = set()
item_transaction_list_dict = {} # {item : set(tranidx keys of transaction_cnt_dict)}
hitted_transaction_set = set() # set of tranidx values; cleared and refilled by get_frequent_itemsets_1
# Load the transaction set
def prepare_data():
    """Read the transaction file and build the global transaction indexes.

    Reads ``data_file`` (one comma-separated transaction per line, blank
    lines skipped), treats transactions holding the same items as one
    distinct transaction, and then fills:

    * ``transaction_cnt_dict``: tranidx (1-based int) -> number of times
      that distinct transaction occurred in the file;
    * ``item_transaction_list_dict``: item -> set of tranidx containing it;
    * ``transaction_cnt``: number of non-blank lines read;
    * ``transaction_cnt_distinct``: number of distinct transactions;
    * ``min_sup_cnt``: ``int(transaction_cnt * minsup)``.

    Both dictionaries are emptied first so that calling this function again
    does not leave stale transaction indexes behind.

    Returns None. Any error raised by ``open`` propagates to the caller.
    """
    # Only names that are rebound need a global declaration; the two
    # dictionaries are mutated in place.
    global transaction_cnt
    global min_sup_cnt
    global transaction_cnt_distinct

    pre_transaction_cnt_dict = {}
    n = 0
    # The with-block guarantees the handle is closed even if reading fails.
    with open(data_file) as data_fh:
        print('Reading data from ' + data_file + '...')
        for line in data_fh:
            line = line.strip()  # drop the trailing newline
            if line == '':
                continue
            n = n + 1
            # Sort the items so that item order does not create "new" transactions.
            tp = tuple(sorted(line.split(',')))
            pre_transaction_cnt_dict[tp] = pre_transaction_cnt_dict.get(tp, 0) + 1

    # Total number of transactions
    transaction_cnt = n
    print('Totally read ' + str(n) + ' lines.')

    # (Re)initialise transaction_cnt_dict and item_transaction_list_dict
    transaction_cnt_dict.clear()
    item_transaction_list_dict.clear()
    tranidx = 1
    for tp in pre_transaction_cnt_dict:
        transaction_cnt_dict[tranidx] = pre_transaction_cnt_dict[tp]
        for item in tp:
            item_transaction_list_dict.setdefault(item, set()).add(tranidx)
        tranidx = tranidx + 1
    del pre_transaction_cnt_dict

    transaction_cnt_distinct = len(transaction_cnt_dict)
    # NOTE(review): int() truncates, so the count threshold can fall below
    # transaction_cnt * minsup (and is 0 for small inputs) - confirm this is
    # intended given that minsup is documented as a ">=" bound.
    min_sup_cnt = int(transaction_cnt * minsup)
    print('The number of total transactions is ' + str(transaction_cnt) + '.')
    print('The number of distinct transactions is ' + str(transaction_cnt_distinct) + '.')
    print('The min support count is ' + str(min_sup_cnt) + '.')
    print('Function prepare_data done.')
# Build the frequent 1-itemsets by counting directly from item_transaction_list_dict
def get_frequent_itemsets_1():
    """Compute the frequent 1-itemsets and store them in ``frequent_itemsets[1]``.

    For every item in ``item_transaction_list_dict`` the support count is the
    sum of ``transaction_cnt_dict[tranidx]`` over the transactions that
    contain the item. Items whose count is at least ``min_sup_cnt`` are
    recorded as ``frequent_itemsets[1][(item,)] = count``.

    ``hitted_transaction_set`` is cleared and then refilled with the tranidx
    of every transaction that contains a frequent item.

    Returns None.
    """
    hitted_transaction_set.clear()
    frequent_1 = {}
    frequent_itemsets[1] = frequent_1
    for item in item_transaction_list_dict:
        tranidx_set = item_transaction_list_dict[item]
        # Each distinct transaction must be weighted by how often it occurred;
        # using len(tranidx_set) would undercount repeated transactions.
        cnt = sum(transaction_cnt_dict[tranidx] for tranidx in tranidx_set)
        if cnt >= min_sup_cnt:
            frequent_1[(item,)] = cnt
            # NOTE(review): the pasted source lost its indentation; this fill
            # is assumed to belong to the frequent-item branch - confirm.
            hitted_transaction_set.update(tranidx_set)
    print('Function get_frequent_itemsets_1 done')
#获取候选项,根据frequent_itemsets_verified填充frequent_itemsets_candidate,清空frequent_itemsets_verified
#用Fk-1 * Fk-1法
#如果得不到新的K项,返回-1
def get_candida
获取频繁项集和关联规则的Python实现【先验算法】
最新推荐文章于 2024-04-22 23:03:04 发布