实验题目:
某医院为了研究几种内科疾病的关联,随机抽取了七十名病人的档案,得到的情况记录在 数据集.txt 中,其中一部分数据如下图所示:
其中,Cardiacfailure表示心力衰竭,Myocardialinfarction表示心肌梗塞,uremia表示尿毒症,diabetes表示糖尿病,Renalfailure表示肾衰竭,Other表示其他疾病。
要求:请自行根据数据的实际情况指定最小支持度与最小置信度,并采用FP增长算法挖掘这些疾病之间的关联规则。
一、实验代码
import pprint
def loadDataSet(path="数据集.txt", encoding="utf-8"):
    """Read the transaction file and return one item list per record.

    Every non-blank line is expected to look like ``<id> item1 item2 ...``.
    The leading record id is dropped and the remaining whitespace-separated
    tokens are kept as that record's items.

    Args:
        path: File to read. Defaults to the data set used by the experiment.
        encoding: Text encoding used to decode the file.

    Returns:
        A list of transactions, each a list of item-name strings. A line
        that holds only an id yields an empty list; blank lines are skipped.
    """
    retData = []
    # The context manager closes the handle even if reading fails.
    with open(path, "r", encoding=encoding) as f2:
        for line in f2:
            # split() without arguments tolerates tabs and repeated spaces,
            # which split(' ') would turn into empty-string items.
            tokens = line.split()
            if not tokens:
                continue
            retData.append(tokens[1:])
    return retData
def transfer2FrozenDataSet(dataSet):
    """Collapse a list of transactions into ``{frozenset(items): occurrences}``.

    Identical transactions (same items, any order) share one key, and the
    value records how many times that transaction occurred so that support
    counts computed from the result match the raw data.

    Args:
        dataSet: Iterable of transactions, each an iterable of hashable items.

    Returns:
        A dict mapping each distinct item set to its number of occurrences.
    """
    frozenDataSet = {}
    for elem in dataSet:
        key = frozenset(elem)
        # Accumulate instead of assigning 1: repeated transactions must
        # each contribute to the support of their items.
        frozenDataSet[key] = frozenDataSet.get(key, 0) + 1
    return frozenDataSet
class TreeNode:
    """One node of an FP-tree.

    Attributes are set in ``__init__``: the item name, its count, a link to
    the parent node, a link to the next node carrying the same item
    (initially ``None``) and a dict of child nodes keyed by item name.
    """

    def __init__(self, nodeName, count, nodeParent):
        self.children = {}
        self.nextSimilarItem = None
        self.nodeParent = nodeParent
        self.nodeName = nodeName
        self.count = count

    def increaseC(self, count):
        """Add ``count`` to this node's counter."""
        self.count += count

    def disp(self, ind=1):
        """Print this subtree as text, one node per line, indented by depth."""
        padding = ' ' * ind
        print(padding, self.nodeName, ' ', self.count)
        deeper = ind + 1
        for name in self.children:
            self.children[name].disp(deeper)
def createFPTree(frozenDataSet, minSupport):
    """Build an FP-tree and its header table from weighted transactions.

    Args:
        frozenDataSet: Dict mapping each transaction (an iterable of items,
            used as the dict key) to its weight.
        minSupport: Minimum total weight an item needs to be kept.

    Returns:
        ``(fptree, headPointTable)`` where ``headPointTable`` maps each kept
        item to ``[support, first_node_of_that_item]``; ``(None, None)``
        when no item reaches ``minSupport``.
    """
    # First scan: total weight of every single item.
    itemSupport = {}
    for transaction, weight in frozenDataSet.items():
        for item in transaction:
            itemSupport[item] = itemSupport.get(item, 0) + weight

    # Keep only the items that reach the threshold; the second slot of each
    # entry is filled in while the tree is built.
    headPointTable = {}
    for item, support in itemSupport.items():
        if support >= minSupport:
            headPointTable[item] = [support, None]
    if not headPointTable:
        return None, None

    fptree = TreeNode("null", 1, None)
    # Second scan: insert each transaction's frequent items into the tree.
    for transaction, weight in frozenDataSet.items():
        kept = {item for item in transaction if item in headPointTable}
        if not kept:
            continue
        # Highest support first; ties are broken by item name so that every
        # transaction uses the same ordering.
        orderedFrequentItems = sorted(
            kept, key=lambda item: (-headPointTable[item][0], item)
        )
        updateFPTree(fptree, orderedFrequentItems, headPointTable, weight)
    return fptree, headPointTable
def updateFPTree(fptree, orderedFrequentItems, headPointTable, count):
    """Insert one ordered transaction into the tree rooted at ``fptree``.

    Walks down from ``fptree`` following ``orderedFrequentItems``. An
    existing child has ``count`` added to it; a missing child is created
    and appended to the same-item chain recorded in ``headPointTable``.

    Args:
        fptree: Node under which the items are inserted.
        orderedFrequentItems: Items of the transaction in insertion order.
            An empty sequence is a no-op.
        headPointTable: Dict mapping item to ``[support, first_node]``;
            the ``first_node`` slot is updated in place.
        count: Weight of the transaction.
    """
    # Iterating instead of recursing on orderedFrequentItems[1:] avoids
    # copying the tail at every level and makes the empty list safe.
    node = fptree
    for item in orderedFrequentItems:
        if item in node.children:
            child = node.children[item]
            child.increaseC(count)
        else:
            child = TreeNode(item, count, node)
            node.children[item] = child
            # Only newly created nodes are linked into the header chain;
            # linking an existing node again would create a cycle.
            chainHead = headPointTable[item][1]
            if chainHead is None:
                headPointTable[item][1] = child
            else:
                updateHeadPointTable(chainHead, child)
        node = child
def updateHeadPointTable(headPointBeginNode, targetNode):
    """Append ``targetNode`` to the end of a same-item node chain.

    Args:
        headPointBeginNode: Any node of the chain; the chain is followed
            through ``nextSimilarItem`` until its last node.
        targetNode: Node to attach after the last node.
    """
    tail = headPointBeginNode
    while True:
        successor = tail.nextSimilarItem
        if successor is None:
            break
        tail = successor
    tail.nextSimilarItem = targetNode
def mineFPTree(headPointTable, prefix, frequentPatterns, minSupport):
    """Recursively collect frequent patterns from a header table.

    For every item in ``headPointTable`` (lowest support first) the pattern
    ``prefix + item`` is recorded with the item's support, then a
    conditional FP-tree is built from the item's prefix paths and mined
    with the extended prefix.

    Args:
        headPointTable: Dict mapping item to ``[support, first_node]``.
        prefix: Set of items already fixed for this branch of the search.
        frequentPatterns: Output dict, filled in place with
            ``frozenset(pattern) -> support``.
        minSupport: Minimum support used for the conditional trees.
    """
    itemsBySupport = sorted(
        headPointTable, key=lambda item: headPointTable[item][0]
    )
    for item in itemsBySupport:
        pattern = prefix | {item}
        frequentPatterns[frozenset(pattern)] = headPointTable[item][0]

        conditionalBase = getPrefixPath(headPointTable, item)
        if not conditionalBase:
            continue
        _, conditionalTable = createFPTree(conditionalBase, minSupport)
        # createFPTree yields None when the conditional base has no
        # frequent item, which ends this branch.
        if conditionalTable is not None:
            mineFPTree(conditionalTable, pattern, frequentPatterns, minSupport)
def getPrefixPath(headPointTable, headPointItem):
    """Collect the prefix paths of every tree node carrying one item.

    Args:
        headPointTable: Dict mapping item to ``[support, first_node]``.
        headPointItem: Item whose node chain is followed.

    Returns:
        Dict mapping ``frozenset(ancestor item names)`` to the count of the
        node the path leads to. Nodes without ancestors below the root
        contribute nothing.
    """
    prefixPath = {}
    node = headPointTable[headPointItem][1]
    # The first node is always processed, then the chain is followed
    # through nextSimilarItem until it ends.
    while True:
        ancestors = ascendTree(node)
        if ancestors:
            prefixPath[frozenset(ancestors)] = node.count
        node = node.nextSimilarItem
        if node is None:
            break
    return prefixPath
def ascendTree(treeNode):
    """Return the names of a node's ancestors, nearest ancestor first.

    The walk stops at a node without a parent or at a parent whose name is
    ``'null'``; that parent is not included. ``treeNode`` itself is never
    part of the result.
    """
    prefixs = []
    ancestor = treeNode.nodeParent
    while ancestor is not None and ancestor.nodeName != 'null':
        prefixs.append(ancestor.nodeName)
        ancestor = ancestor.nodeParent
    return prefixs
def rulesGenerator(frequentPatterns, minConf, rules):
    """Generate association rules from every pattern with two or more items.

    Args:
        frequentPatterns: Dict mapping ``frozenset(pattern)`` to support.
        minConf: Minimum confidence a rule needs to be kept.
        rules: Output list, extended in place by ``getRules``.
    """
    multiItemPatterns = [
        pattern for pattern in frequentPatterns if len(pattern) > 1
    ]
    for pattern in multiItemPatterns:
        getRules(pattern, pattern, rules, frequentPatterns, minConf)
def removeStr(set, str):
    """Return a frozenset of the elements of ``set`` that differ from ``str``."""
    # The parameter names shadow builtins; they are kept because they are
    # part of the function's signature.
    return frozenset(elem for elem in set if elem != str)
def getRules(frequentset, currentset, rules, frequentPatterns, minConf):
    """Derive rules ``antecedent -> frequentset - antecedent`` recursively.

    Each element of ``currentset`` is dropped in turn to form a candidate
    antecedent. A candidate whose confidence
    ``support(frequentset) / support(antecedent)`` reaches ``minConf`` is
    appended to ``rules`` (unless that rule is already present) and, when
    it still has at least two items, is shrunk further.

    Args:
        frequentset: The frequent pattern the rules are derived from.
        currentset: Antecedent candidate currently being shrunk.
        rules: Output list of ``(antecedent, consequent, confidence)``.
        frequentPatterns: Dict mapping ``frozenset(pattern)`` to support.
        minConf: Minimum confidence a rule needs to be kept.
    """
    for dropped in currentset:
        antecedent = frozenset(
            elem for elem in currentset if elem != dropped
        )
        confidence = frequentPatterns[frequentset] / frequentPatterns[antecedent]
        if confidence < minConf:
            continue
        consequent = frequentset - antecedent
        # The same antecedent is reachable along several removal orders,
        # so guard against recording a rule twice.
        alreadyKnown = any(
            rule[0] == antecedent and rule[1] == consequent for rule in rules
        )
        if not alreadyKnown:
            rules.append((antecedent, consequent, confidence))
        if len(antecedent) >= 2:
            getRules(frequentset, antecedent, rules, frequentPatterns, minConf)
if __name__ == '__main__':
    # Load the records and collapse them into weighted item sets.
    dataSet = loadDataSet()
    frozenDataSet = transfer2FrozenDataSet(dataSet)

    # Minimum support is an absolute transaction count.
    min_sup = 3
    fptree, headPointTable = createFPTree(frozenDataSet, min_sup)

    frequentPatterns = {}
    # createFPTree returns (None, None) when no item reaches min_sup;
    # displaying or mining a missing tree would raise, so skip both and
    # report empty results instead.
    if fptree is not None:
        fptree.disp()
        mineFPTree(headPointTable, set(), frequentPatterns, min_sup)
    print("")
    print("频繁模式:")
    pprint.pprint(frequentPatterns)

    # Minimum confidence is a ratio in [0, 1].
    min_conf = 0.7
    rules = []
    rulesGenerator(frequentPatterns, min_conf, rules)
    print("关联规则:")
    pprint.pprint(rules)
    print('规则总数:', len(rules))
二、实验结果
构建得到的FP-TREE如下:
设置最小支持度和最小置信度:
挖掘得到的频繁模式和关联规则:
三、结果分析
根据以上关联规则可得出以下结论:
最小支持度为3,最小置信度为70%时:
- 糖尿病、尿毒症、肾衰竭三种疾病之间有较强的关联关系,心力衰竭、肾衰竭、糖尿病三种疾病之间有很强的关联关系。
- 对于糖尿病和尿毒症的疾病人群而言,有80%的患者会并发肾衰竭。
- 对于尿毒症和肾衰竭的疾病人群而言,也有80%的患者会并发糖尿病。
- 对于肾衰竭和心力衰竭的疾病人群而言,有100%的患者会并发糖尿病。
附 数据集.txt
1 Cardiacfailure Myocardialinfarction Other
2 Cardiacfailure
3 Cardiacfailure uremia Myocardialinfarction
4 Renalfailure Cardiacfailure diabetes uremia
5 uremia Cardiacfailure Renalfailure diabetes
6 diabetes
7 diabetes Cardiacfailure Myocardialinfarction Other
8 diabetes uremia
9 diabetes
10 Renalfailure diabetes uremia
11 diabetes
12 Cardiacfailure diabetes uremia Renalfailure
13 uremia diabetes Renalfailure Cardiacfailure
14 Renalfailure
15 Other Renalfailure
16 Renalfailure diabetes
17 Myocardialinfarction Cardiacfailure
18 uremia Renalfailure
19 Renalfailure
20 uremia diabetes Renalfailure
21 uremia Renalfailure
22 uremia
23 Cardiacfailure uremia Renalfailure diabetes Myocardialinfarction
24 Renalfailure diabetes uremia Cardiacfailure
25 Myocardialinfarction Cardiacfailure Other
26 diabetes Renalfailure uremia Cardiacfailure
27 uremia Renalfailure diabetes Cardiacfailure Myocardialinfarction
28 uremia diabetes Renalfailure Myocardialinfarction
29 diabetes uremia
30 Myocardialinfarction
31 diabetes Renalfailure uremia Cardiacfailure
32 Cardiacfailure diabetes Other
33 Renalfailure diabetes
34 uremia diabetes Renalfailure
35 Myocardialinfarction Cardiacfailure
36 uremia Renalfailure
37 Other Renalfailure Myocardialinfarction
38 Renalfailure diabetes uremia
39 Cardiacfailure Myocardialinfarction Other
40 Myocardialinfarction Other
41 uremia Renalfailure diabetes
42 Cardiacfailure diabetes uremia Renalfailure
43 Myocardialinfarction
44 diabetes uremia Renalfailure
45 Myocardialinfarction Renalfailure
46 Cardiacfailure Myocardialinfarction
47 diabetes
48 Myocardialinfarction Cardiacfailure
49 diabetes Renalfailure uremia
50 Renalfailure
51 diabetes uremia
52 uremia Renalfailure diabetes
53 Other Renalfailure Myocardialinfarction
54 Renalfailure diabetes uremia Cardiacfailure
55 Renalfailure diabetes uremia
56 Myocardialinfarction
57 Renalfailure diabetes
58 Cardiacfailure Myocardialinfarction Other
59 uremia diabetes Renalfailure
60 Cardiacfailure diabetes Other
61 uremia Renalfailure
62 Myocardialinfarction
63 uremia diabetes Renalfailure Myocardialinfarction
64 Renalfailure diabetes Cardiacfailure
65 uremia Renalfailure
66 diabetes
67 Cardiacfailure diabetes Other
68 diabetes Renalfailure uremia Cardiacfailure
69 uremia Renalfailure diabetes
70 Cardiacfailure Myocardialinfarction