python实现fp_growth调包_FP-Growth Algorithm 的Python实现

本文介绍了使用Python实现FP-Growth算法的详细过程,包括FP-Tree的构造、更新、挖掘等步骤。通过实例展示了如何从数据集中创建FP-Tree,并进行频繁项集的挖掘。代码包括了树节点类的定义、更新项头表、更新树、建造树以及挖掘树的函数。
摘要由CSDN通过智能技术生成

算法原理可见Superman:FP-Growth算法简介​zhuanlan.zhihu.com

构造节点类:

使用关键字class为FP-tree中的节点构造节点类,包括节点类的属性、用来累加节点出现次数的方法和显示FP-tree的方法。节点类的属性有节点名、节点出现的次数、用于链接所有同名节点的node_link、节点的父节点和子节点。代码如下:

class tree_node:
    """One node of an FP-tree.

    Attributes set by the constructor:
        name: the value passed as ``name_value``.
        count: the occurrence count passed as ``num_occur``.
        parent: the node passed as ``parent_node`` (``None`` for a root).
        node_link: initialised to ``None``; other code chains nodes through it.
        children: dict, initially empty, mapping a child name to its node.
    """

    def __init__(self, name_value, num_occur, parent_node):
        self.name = name_value
        self.count = num_occur
        self.parent = parent_node
        self.node_link = None
        self.children = {}

    def inc(self, num_occur):
        """Add ``num_occur`` to this node's count."""
        self.count += num_occur

    def displaces(self, ind=1):
        """Print this node and, recursively, its subtree.

        Each level is printed with ``ind`` leading spaces; children are
        printed with ``ind + 1``.
        """
        print(' ' * ind, self.name, ' ', self.count)
        for subtree in self.children.values():
            subtree.displaces(ind + 1)

更新项头表:

项头表为每个频繁项保存一条由node_link串起来的同名节点链表,使得挖掘时可以快速遍历某一项的所有节点。更新时从链表头沿node_link依次向后查找,直到链表末尾,再把新节点接在末尾。代码如下:

#update items header forms

def update_header(node_test, target_node):
    """Append ``target_node`` to the end of a ``node_link`` chain.

    Starting at ``node_test``, follow ``node_link`` until a node whose
    ``node_link`` is ``None`` is reached, then point that node at
    ``target_node``.
    """
    tail = node_test
    while tail.node_link is not None:
        tail = tail.node_link
    tail.node_link = target_node

更新树

更新树是一种递归的思想:首先判断当前项对应的子节点是否已经存在,如果存在,则合并到该节点,并把它的计数增加count;否则创建新的子节点,并更新项头表中该项的节点链。然后对剩余的项在该子节点下重复同样的过程。代码如下:

#update iteratly tree (top to down)

def update_tree(items, in_tree, header_table, count):
    """Insert one ordered transaction into the FP-tree below ``in_tree``.

    Args:
        items: ordered sequence of item names; the first item becomes (or
            matches) a child of ``in_tree``, the next a child of that
            child, and so on.
        in_tree: node under which the path is inserted.
        header_table: dict mapping item -> ``[support, first_node]``; slot 1
            is filled with the first node created for an item, later nodes
            are appended to that node's ``node_link`` chain.
        count: amount added to every node along the path.

    The walk is iterative, so long transactions neither copy the list on
    every level (``items[1:]``) nor hit the recursion limit, and an empty
    ``items`` is simply a no-op.
    """
    node = in_tree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            # Shared prefix: just bump the existing node.
            child.inc(count)
        else:
            child = tree_node(item, count, node)  # build a new branch
            node.children[item] = child
            entry = header_table[item]
            if entry[1] is None:
                entry[1] = child
            else:
                # Link the new node behind the existing same-item nodes.
                update_header(entry[1], child)
        node = child

建造树

下面就开始建造树。首先要扫描整个数据库,根据最小支持度,获得频繁1-项集。然后根据频繁1-项集中各项的支持度,把每一条事务数据中的频繁项按支持度降序排序,并调用update_tree来建造树。具体代码如下:

#build FP-Tree

def creat_tree(data_set, min_sup=1):
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        data_set: dict mapping a transaction (an iterable of items, e.g. a
            ``frozenset``) to the number of times it occurs.
        min_sup: minimum support count; items whose total count is below
            it are dropped.

    Returns:
        ``(root, header_table)`` where ``header_table`` maps each frequent
        item to ``[support, first_node]``, or ``(None, None)`` when no item
        reaches ``min_sup``.
    """
    # First pass: total support of every single item.
    item_support = {}
    for trans, occurrences in data_set.items():
        for item in trans:
            item_support[item] = item_support.get(item, 0) + occurrences

    # Keep only frequent items; slot 1 will hold the head of the node chain.
    header_table = {
        item: [support, None]
        for item, support in item_support.items()
        if support >= min_sup
    }
    if not header_table:
        return None, None

    def rank(item):
        # Descending support, with a deterministic tie-break. Sorting by
        # support alone leaves equally frequent items in set-iteration
        # order, which can differ from one transaction to the next; the
        # same items would then be inserted in different orders and the
        # mined supports would be wrong.
        return (-header_table[item][0], repr(item))

    ret_tree = tree_node("Null Set", 1, None)  # root node
    # Second pass: insert every transaction, restricted to frequent items.
    for trans_set, count in data_set.items():
        frequent = [item for item in trans_set if item in header_table]
        if frequent:
            frequent.sort(key=rank)
            update_tree(frequent, ret_tree, header_table, count)
    return ret_tree, header_table

例子

以一个简单的数据库来说明,如下所示:

def local_data():
    """Return the nine-transaction example database as a list of lists."""
    return [
        ['I1', 'I2', 'I5'],
        ['I2', 'I4'],
        ['I2', 'I3'],
        ['I1', 'I2', 'I4'],
        ['I1', 'I3'],
        ['I2', 'I3'],
        ['I1', 'I3'],
        ['I1', 'I2', 'I3', 'I5'],
        ['I1', 'I2', 'I3'],
    ]

def creat_set(data_set):
    """Collapse a list of transactions into ``{frozenset(items): occurrences}``.

    Transactions containing the same items (in any order) share one key and
    their occurrences are added up.
    """
    ret_dic = {}
    for trans in data_set:
        key = frozenset(trans)
        ret_dic[key] = ret_dic.get(key, 0) + 1
    return ret_dic

以上是建造FP-Tree树的过程,下面开始挖掘树。首先要找到所有前缀路径,然后创建条件FP-Tree。之后就递归调用mine_tree函数。

具体代码如下所示:

#search prefix tree

def before_tree(header_table_node, bef_path):
    """Collect the names on the way from a node up towards the root.

    Starting with ``header_table_node`` itself, append each node's name to
    ``bef_path`` and move to its parent, stopping at the node whose parent
    is ``None`` (that node's name is not appended). ``bef_path`` is
    modified in place; nothing is returned.
    """
    node = header_table_node
    while node.parent is not None:
        bef_path.append(node.name)
        node = node.parent

#search all prefix tree of the same node

def find_path(base_pat, tree_node):
    """Collect the conditional pattern base reachable from a node chain.

    Args:
        base_pat: not used by this function; kept for the existing call
            signature.
        tree_node: first node of a ``node_link`` chain (may be ``None``).
            Note that this parameter name hides the ``tree_node`` class
            inside the function body.

    Returns:
        dict mapping ``frozenset`` of ancestor names (excluding the node
        itself and the parentless root) to the summed ``count`` of the
        chain nodes having that ancestor set. Nodes with no such ancestors
        contribute nothing.
    """
    cond_pats = {}
    node = tree_node
    while node is not None:
        # Walk upwards, skipping the node itself and stopping before the
        # root (the only node whose parent is None).
        prefix = []
        ancestor = node.parent
        while ancestor is not None and ancestor.parent is not None:
            prefix.append(ancestor.name)
            ancestor = ancestor.parent
        if prefix:
            key = frozenset(prefix)
            # Accumulate rather than overwrite: two chain nodes can share
            # the same set of ancestors, and both counts must be kept.
            cond_pats[key] = cond_pats.get(key, 0) + node.count
        node = node.node_link
    return cond_pats

#mine tree

def mine_tree(in_tree, header_table, min_sup, pre_path, fre_item_set, fre_item_count):
    """Recursively mine frequent itemsets from a header table.

    Args:
        in_tree: tree belonging to ``header_table``; not read by this
            function.
        header_table: dict mapping item -> ``[support, first_node]``.
        min_sup: minimum support passed on to ``creat_tree``.
        pre_path: set of items already fixed on this recursion branch.
        fre_item_set: list that receives every frequent itemset (as a set).
        fre_item_count: dict that receives ``frozenset(itemset) -> support``.

    Returns:
        ``(fre_item_set, fre_item_count)``, the same objects passed in.

    Side effect: prints each new itemset and its conditional tree.
    """
    # Visit items in ascending order of support.
    by_support = sorted(header_table.items(), key=lambda entry: entry[1][0])
    for base_pat, (support, first_node) in by_support:
        new_fre_set = pre_path.copy()
        new_fre_set.add(base_pat)

        # Record the itemset together with its support count.
        fre_item_count[frozenset(new_fre_set)] = support
        fre_item_set.append(new_fre_set)

        # Build the conditional tree for this itemset from its prefix paths.
        cond_pat_path = find_path(base_pat, first_node)
        my_tree, my_header = creat_tree(cond_pat_path, min_sup)

        print("condition tree for :", new_fre_set)
        if my_tree is not None:
            my_tree.displaces(1)
        if my_header is not None:
            mine_tree(my_tree, my_header, min_sup, new_fre_set,
                      fre_item_set, fre_item_count)
    return fre_item_set, fre_item_count

注意:

节点类可以单独放一个文件里,也可以在同一个文件内使用。单独使用时,节点类放在file_name.py文件内。需要添加import file_name语句。

可执行代码如下所示:

import csv

class tree_node:
    """One node of an FP-tree.

    Attributes set by the constructor:
        name: the value passed as ``name_value``.
        count: the occurrence count passed as ``num_occur``.
        parent: the node passed as ``parent_node`` (``None`` for a root).
        node_link: initialised to ``None``; other code chains nodes through it.
        children: dict, initially empty, mapping a child name to its node.
    """

    def __init__(self, name_value, num_occur, parent_node):
        self.name = name_value
        self.count = num_occur
        self.parent = parent_node
        self.node_link = None
        self.children = {}

    def inc(self, num_occur):
        """Add ``num_occur`` to this node's count."""
        self.count += num_occur

    def displaces(self, ind=1):
        """Print this node and, recursively, its subtree.

        Each level is printed with ``ind`` leading spaces; children are
        printed with ``ind + 1``.
        """
        print(' ' * ind, self.name, ' ', self.count)
        for subtree in self.children.values():
            subtree.displaces(ind + 1)

#update items header forms

def update_header(node_test, target_node):
    """Append ``target_node`` to the end of a ``node_link`` chain.

    Starting at ``node_test``, follow ``node_link`` until a node whose
    ``node_link`` is ``None`` is reached, then point that node at
    ``target_node``.
    """
    tail = node_test
    while tail.node_link is not None:
        tail = tail.node_link
    tail.node_link = target_node

#update iteratly tree (top to down)

def update_tree(items, in_tree, header_table, count):
    """Insert one ordered transaction into the FP-tree below ``in_tree``.

    Args:
        items: ordered sequence of item names; the first item becomes (or
            matches) a child of ``in_tree``, the next a child of that
            child, and so on.
        in_tree: node under which the path is inserted.
        header_table: dict mapping item -> ``[support, first_node]``; slot 1
            is filled with the first node created for an item, later nodes
            are appended to that node's ``node_link`` chain.
        count: amount added to every node along the path.

    The walk is iterative, so long transactions neither copy the list on
    every level (``items[1:]``) nor hit the recursion limit, and an empty
    ``items`` is simply a no-op.
    """
    node = in_tree
    for item in items:
        child = node.children.get(item)
        if child is not None:
            # Shared prefix: just bump the existing node.
            child.inc(count)
        else:
            child = tree_node(item, count, node)  # build a new branch
            node.children[item] = child
            entry = header_table[item]
            if entry[1] is None:
                entry[1] = child
            else:
                # Link the new node behind the existing same-item nodes.
                update_header(entry[1], child)
        node = child

#build FP-Tree

def creat_tree(data_set, min_sup=1):
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        data_set: dict mapping a transaction (an iterable of items, e.g. a
            ``frozenset``) to the number of times it occurs.
        min_sup: minimum support count; items whose total count is below
            it are dropped.

    Returns:
        ``(root, header_table)`` where ``header_table`` maps each frequent
        item to ``[support, first_node]``, or ``(None, None)`` when no item
        reaches ``min_sup``.
    """
    # First pass: total support of every single item.
    item_support = {}
    for trans, occurrences in data_set.items():
        for item in trans:
            item_support[item] = item_support.get(item, 0) + occurrences

    # Keep only frequent items; slot 1 will hold the head of the node chain.
    header_table = {
        item: [support, None]
        for item, support in item_support.items()
        if support >= min_sup
    }
    if not header_table:
        return None, None

    def rank(item):
        # Descending support, with a deterministic tie-break. Sorting by
        # support alone leaves equally frequent items in set-iteration
        # order, which can differ from one transaction to the next; the
        # same items would then be inserted in different orders and the
        # mined supports would be wrong.
        return (-header_table[item][0], repr(item))

    ret_tree = tree_node("Null Set", 1, None)  # root node
    # Second pass: insert every transaction, restricted to frequent items.
    for trans_set, count in data_set.items():
        frequent = [item for item in trans_set if item in header_table]
        if frequent:
            frequent.sort(key=rank)
            update_tree(frequent, ret_tree, header_table, count)
    return ret_tree, header_table

'''def local_data():

test_data = [['r','z','h','j','p'],

['z','y','x','w','v','u','t','s'],

['z'],

['r','x','n','o','s'],

['y','r','x','z','q','t','p'],

['y','z','x','e','q','s','t','m']]

return test_data'''

#load data from csv file

'''def load_file(file_path):

try:

with open(file_path,'r') as f_customer_data,\

open("output_datafile.csv",'w',newline = "") as out_file:

reader = csv.reader(f_customer_data) #type is class

writer = csv.writer(out_file) #type is class

#print(type(reader))

header_row = next(reader)

writer.writerow(header_row) #write head information of file to out_file

print(header_row)

highs = []

highs_2 = []

null_character = ""

for row in reader:

if null_character in row: #Judging an element in list

continue

else:

writer.writerow(row[1::]) #write row to out_file 行:row

highs.append(row[1::])

except FileNotFoundError:

print("Sorry, the file"+file_path+"does not exist.")

return highs'''

#example dataset

def load_data():
    """Return the nine-transaction example database as a list of lists."""
    return [
        ['I1', 'I2', 'I5'],
        ['I2', 'I4'],
        ['I2', 'I3'],
        ['I1', 'I2', 'I4'],
        ['I1', 'I3'],
        ['I2', 'I3'],
        ['I1', 'I3'],
        ['I1', 'I2', 'I3', 'I5'],
        ['I1', 'I2', 'I3'],
    ]

def creat_set(data_set):
    """Collapse a list of transactions into ``{frozenset(items): occurrences}``.

    Transactions containing the same items (in any order) share one key and
    their occurrences are added up.
    """
    ret_dic = {}
    for trans in data_set:
        key = frozenset(trans)
        ret_dic[key] = ret_dic.get(key, 0) + 1
    return ret_dic

#search prefix tree

def before_tree(header_table_node, bef_path):
    """Collect the names on the way from a node up towards the root.

    Starting with ``header_table_node`` itself, append each node's name to
    ``bef_path`` and move to its parent, stopping at the node whose parent
    is ``None`` (that node's name is not appended). ``bef_path`` is
    modified in place; nothing is returned.
    """
    node = header_table_node
    while node.parent is not None:
        bef_path.append(node.name)
        node = node.parent

#search all prefix tree of the same node

def find_path(base_pat, tree_node):
    """Collect the conditional pattern base reachable from a node chain.

    Args:
        base_pat: not used by this function; kept for the existing call
            signature.
        tree_node: first node of a ``node_link`` chain (may be ``None``).
            Note that this parameter name hides the ``tree_node`` class
            inside the function body.

    Returns:
        dict mapping ``frozenset`` of ancestor names (excluding the node
        itself and the parentless root) to the summed ``count`` of the
        chain nodes having that ancestor set. Nodes with no such ancestors
        contribute nothing.
    """
    cond_pats = {}
    node = tree_node
    while node is not None:
        # Walk upwards, skipping the node itself and stopping before the
        # root (the only node whose parent is None).
        prefix = []
        ancestor = node.parent
        while ancestor is not None and ancestor.parent is not None:
            prefix.append(ancestor.name)
            ancestor = ancestor.parent
        if prefix:
            key = frozenset(prefix)
            # Accumulate rather than overwrite: two chain nodes can share
            # the same set of ancestors, and both counts must be kept.
            cond_pats[key] = cond_pats.get(key, 0) + node.count
        node = node.node_link
    return cond_pats

#mine tree

def mine_tree(in_tree, header_table, min_sup, pre_path, fre_item_set, fre_item_count):
    """Recursively mine frequent itemsets from a header table.

    Args:
        in_tree: tree belonging to ``header_table``; not read by this
            function.
        header_table: dict mapping item -> ``[support, first_node]``.
        min_sup: minimum support passed on to ``creat_tree``.
        pre_path: set of items already fixed on this recursion branch.
        fre_item_set: list that receives every frequent itemset (as a set).
        fre_item_count: dict that receives ``frozenset(itemset) -> support``.

    Returns:
        ``(fre_item_set, fre_item_count)``, the same objects passed in.

    Side effect: prints each new itemset and its conditional tree.
    """
    # Visit items in ascending order of support.
    by_support = sorted(header_table.items(), key=lambda entry: entry[1][0])
    for base_pat, (support, first_node) in by_support:
        new_fre_set = pre_path.copy()
        new_fre_set.add(base_pat)

        # Record the itemset together with its support count.
        fre_item_count[frozenset(new_fre_set)] = support
        fre_item_set.append(new_fre_set)

        # Build the conditional tree for this itemset from its prefix paths.
        cond_pat_path = find_path(base_pat, first_node)
        my_tree, my_header = creat_tree(cond_pat_path, min_sup)

        print("condition tree for :", new_fre_set)
        if my_tree is not None:
            my_tree.displaces(1)
        if my_header is not None:
            mine_tree(my_tree, my_header, min_sup, new_fre_set,
                      fre_item_set, fre_item_count)
    return fre_item_set, fre_item_count

#calculate support rate %

def support_grate(fre_item_count, trans_dic):
    """Convert support counts into support ratios.

    Args:
        fre_item_count: dict mapping itemset -> support count.
        trans_dic: dict mapping transaction -> occurrence count; the sum of
            its values is used as the total number of transactions.

    Returns:
        A new dict mapping each itemset to ``count / total``. The result is
        built locally on every call, so the function does not depend on a
        module-level dictionary and never carries entries over from an
        earlier call.

    Raises:
        ZeroDivisionError: if ``fre_item_count`` is non-empty while the
            values of ``trans_dic`` sum to zero.
    """
    total_trans = sum(trans_dic.values())
    return {
        item_set: support / total_trans
        for item_set, support in fre_item_count.items()
    }

#file_path = "trafficSceneData.csv"

# Module-level result containers.
fre_item = []
fre_item_count = {}
set_grate = {}

# Load the example transactions and collapse them into {frozenset: occurrences}.
sim_data = load_data()
set_data = creat_set(sim_data)

# Build the FP-tree with a minimum support count of 2 and print it.
my_data_tree, my_header_table = creat_tree(set_data, 2)
my_data_tree.displaces()

# Mine the tree starting from an empty prefix, then derive support ratios.
fre_item, fre_item_count = mine_tree(
    my_data_tree, my_header_table, 2, set(), fre_item, fre_item_count
)
grate_sup = support_grate(fre_item_count, set_data)

print(fre_item)
print(fre_item_count)
print(grate_sup)

结果

参考《机器学习实战》

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值