import time
class TreeNode:
    """A single node of an FP-Tree."""

    def __init__(self, item, count, parent):
        """Create a node.

        Args:
            item: the item stored at this node (None for the root).
            count: support count accumulated at this node.
            parent: parent TreeNode (None for the root).
        """
        self.item = item        # item represented by this node
        self.count = count      # support count
        self.parent = parent    # link back to the parent node
        self.children = {}      # item -> child TreeNode
        self.node_link = None   # next node holding the same item (header chain)
def create_tree(data, min_support):
    """Build an FP-Tree from a dataset of weighted transactions.

    Args:
        data: dict mapping frozenset(transaction) -> occurrence count.
        min_support: minimum support count for an item to be kept.

    Procedure:
        1. First scan: accumulate the support count of every item.
        2. Drop items below min_support, producing the header table.
        3. Rank surviving items by support so every transaction can be
           reordered consistently (most frequent item first).
        4. Second scan: filter and reorder each transaction, then insert
           it into the tree.

    Returns:
        (root, header_table): the FP-Tree root node and the header table
        mapping item -> [support_count, head_of_node_link_chain];
        (None, None) when no item reaches min_support.
    """
    # First scan: per-item support, weighted by transaction multiplicity.
    header_table = {}
    for transaction, count in data.items():
        for item in transaction:
            if item in header_table:
                header_table[item][0] += count
            else:
                header_table[item] = [count, None]
    # Remove infrequent items (list() because we delete while iterating).
    for item in list(header_table):
        if header_table[item][0] < min_support:
            del header_table[item]
    # No frequent items at all -> no tree.
    if not header_table:
        return None, None
    # Rank items by support. Bug fix: key on the count itself, not on the
    # whole [count, link] pair — comparing those lists only worked by
    # accident of None == None on ties.
    sorted_items = sorted(header_table.items(), key=lambda entry: entry[1][0])
    rank = {item: i for i, (item, _) in enumerate(sorted_items)}
    # The root carries no item.
    root = TreeNode(None, 1, None)
    # Second scan: insert each filtered, support-ordered transaction.
    for transaction, count in data.items():
        filtered = [item for item in transaction if item in header_table]
        # reverse=True puts the highest-support item first.
        filtered.sort(key=lambda item: rank[item], reverse=True)
        if filtered:
            update_tree(filtered, root, header_table, count)
    return root, header_table
def update_tree(items, node, header_table, count):
    """Insert an ordered item sequence into the FP-Tree under *node*.

    Args:
        items: transaction items, already filtered and support-ordered.
        node: node to insert under (initially the tree root).
        header_table: item -> [support, head of node-link chain].
        count: how many times this transaction occurred.

    For each item: reuse an existing child (bumping its count) or create
    a new child and splice it into that item's header-table chain.
    Iterative rather than recursive, so very long transactions cannot
    exhaust the recursion limit; an empty sequence is a no-op instead of
    an IndexError.
    """
    current = node
    for element in items:
        child = current.children.get(element)
        if child is not None:
            child.count += count
        else:
            child = TreeNode(element, count, current)
            current.children[element] = child
            # Link the new node at the head or tail of the item's chain.
            head = header_table[element][1]
            if head is None:
                header_table[element][1] = child
            else:
                update_header(head, child)
        current = child
def update_header(node_to_test, target_node):
    """Append *target_node* at the tail of a node-link chain.

    Args:
        node_to_test: any node already on the chain (usually the head).
        target_node: newly created node to attach at the end.

    Walks node_link pointers until the last node, then links the target
    there.
    """
    tail = node_to_test
    while tail.node_link is not None:
        tail = tail.node_link
    tail.node_link = target_node
def ascend_tree(node, prefix_path):
    """Collect the items on the path from *node* up to the root.

    Args:
        node: starting node (typically a leaf from a node-link chain).
        prefix_path: list mutated in place; receives node.item first and
            then each ancestor's item, stopping before the root (whose
            parent is None, so the root's item is never recorded).
    """
    current = node
    while current.parent is not None:
        prefix_path.append(current.item)
        current = current.parent
def find_prefix_paths(base_path, header_table):
    """Gather the conditional pattern base for one item.

    Args:
        base_path: the item whose conditional patterns are wanted.
        header_table: item -> [support, head of node-link chain].

    Returns:
        dict mapping frozenset(prefix path, excluding the item itself)
        -> count of the corresponding node. Nodes whose only path entry
        is the item itself contribute nothing.
    """
    patterns = {}
    leaf = header_table[base_path][1]
    # Visit every occurrence of the item via its node-link chain.
    while leaf is not None:
        trail = []
        ascend_tree(leaf, trail)
        # trail[0] is the item itself; keep only non-empty prefixes.
        prefix = trail[1:]
        if prefix:
            patterns[frozenset(prefix)] = leaf.count
        leaf = leaf.node_link
    return patterns
def mine_fp_tree(header_table, min_support, prefix, frequent_itemsets):
    """Recursively mine frequent itemsets from an FP-Tree.

    Args:
        header_table: item -> [support, node-link head] of current tree.
        min_support: minimum support count.
        prefix: set of items already fixed on this conditional branch.
        frequent_itemsets: output list of sets, appended in place.

    For each item — from least to most frequent, the classic FP-Growth
    order — record prefix+item as frequent, collect its conditional
    pattern base, build a conditional FP-Tree from it, and recurse while
    that tree is non-empty.
    """
    # Bug fix: the original built an UNSORTED list despite naming it
    # sorted_items and documenting "start from the lowest support";
    # actually sort ascending by support count here.
    items_by_support = sorted(header_table, key=lambda item: header_table[item][0])
    for item in items_by_support:
        new_prefix = prefix.copy()
        new_prefix.add(item)
        frequent_itemsets.append(new_prefix)
        conditional_patterns = find_prefix_paths(item, header_table)
        # Only the conditional header table is needed for the recursion.
        _, conditional_header = create_tree(conditional_patterns, min_support)
        if conditional_header is not None:
            mine_fp_tree(conditional_header, min_support, new_prefix, frequent_itemsets)
def fp_growth(data, min_support):
    """Run FP-Growth: build the FP-Tree, then mine it.

    Args:
        data: dict mapping frozenset(transaction) -> occurrence count.
        min_support: minimum support count.

    Returns:
        List of frequent itemsets (each a set of items); an empty list
        when no single item is frequent.
    """
    root, header_table = create_tree(data, min_support)
    if root is None:
        # Nothing survived the support threshold.
        return []
    results = []
    mine_fp_tree(header_table, min_support, set(), results)
    return results
# --- Driver: load the retail dataset and run FP-Growth ----------------
DATA_PATH = "C://Users//86180//Desktop//数据挖掘//retail.dat"

data = {}
dataLength = 0
with open(DATA_PATH) as f:
    # Iterate the file directly instead of a manual readline() loop.
    for line in f:
        dataLength += 1
        transaction = frozenset(map(int, line.split()))
        # Collapse duplicate transactions into a multiplicity count so
        # the tree inserts each distinct transaction only once.
        data[transaction] = data.get(transaction, 0) + 1
print('数据长度', dataLength)
# Relative threshold: 0.05% of the number of records.
min_support = 0.0005 * dataLength
print('最小支持度', min_support)
s = time.time()
frequent_itemsets = fp_growth(data, min_support)
e = time.time()
print('计算时间', e - s, 's')
print("Frequent Itemsets:", len(frequent_itemsets))
# Group the results by itemset size for display.
freq_set = {}
for itemset in frequent_itemsets:
    freq_set.setdefault(len(itemset), []).append(itemset)
# Bug fix: label each group with the actual itemset size. The original
# used a running counter `i` that only matched the size by coincidence
# of dict insertion order; iterating sorted sizes is also deterministic.
for size in sorted(freq_set):
    print('第%d项集' % size, len(freq_set[size]))
# FP-Growth algorithm with detailed comments.
# First published 2024-04-27 19:53:25.