前段时间需要对一些客服对话记录做聚类分析,于是抽时间测试了一下常见聚类算法的效果。之前了解过的聚类算法大多在sklearn中都有现成的实现可以直接用,不过OPTICS算法倒没找到,于是就看着论文做了个简易版的。下面是算法源码,关于原理请参考原始论文:
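- M. Ankerst, M. M. Breunig, H.-P. Kriegel, and J. Sander, “OPTICS: Ordering Points To Identify the Clustering Structure,” in Proceedings of the 1999 ACM SIGMOD International Conference on Management of Data. Philadelphia, PA: ACM, 1999, pp. 49–60.
- C. Ding, X. He, and H. D. Simon, “On the Equivalence of Nonnegative Matrix Factorization and Spectral Clustering,” in Proceedings of the 2005 SIAM International Conference on Data Mining, H. Kargupta, J. Srivastava, C. Kamath, and A. Goodman, Eds. Philadelphia, PA: Society for Industrial and Applied Mathematics, 2005, pp. 606–610.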
verbose.py:辅助调试用
class Verbose:
    """Debugging helper that exposes a switchable ``printer`` callable.

    After construction (or after calling :meth:`set_printer`) the instance
    attribute ``printer`` is either the built-in ``print`` or a no-op that
    silently discards whatever it is given.
    """

    def __init__(self, verbose):
        """Select the printer according to the truthiness of *verbose*."""
        self.set_printer(verbose)

    @staticmethod
    def _discard(*args, **kwargs):
        """Accept any ``print``-style call and do nothing."""
        return None

    def set_printer(self, verbose):
        """Install ``print`` when *verbose* is truthy, otherwise a no-op.

        The no-op accepts arbitrary positional and keyword arguments so that
        every call that is valid for ``print`` (several values, ``sep=``,
        ``end=``, or no arguments at all) is also valid in silent mode.
        """
        if verbose:
            self.printer = print
        else:
            self.printer = self._discard
tree.py:定义了二叉树的基本操作
# -*- coding: utf-8 -*-


class Node:
    """Binary-tree node with a payload and optional left/right children.

    All traversal methods are generators that yield ``Node`` objects (not
    their ``data``).  Every traversal starts with an ``if not self`` guard;
    this class defines neither ``__bool__`` nor ``__len__``, so for plain
    ``Node`` instances the guard never triggers and only matters if a
    subclass makes instances falsy.
    """

    def __init__(self, data=None, left=None, right=None):
        """Store the payload and the (possibly ``None``) child nodes."""
        self.data = data
        self.left = left
        self.right = right

    @property
    def is_leaf(self):
        """Return True when neither the left nor the right child is truthy."""
        return (not self.left) and (not self.right)

    def preorder(self):
        """Recursive pre-order traversal: root -> left -> right.

        :returns: generator of nodes
        """
        if not self:
            return

        yield self

        if self.left:
            yield from self.left.preorder()

        if self.right:
            yield from self.right.preorder()

    def preorder_norecur(self):
        """Iterative pre-order traversal: root -> left -> right.

        :returns: generator of nodes
        """
        if not self:
            return

        stack = [self]
        while stack:
            node = stack.pop()
            yield node
            # The stack is last-in-first-out, so push the right child first
            # to have the left child popped (and visited) before it.
            if node.right:
                stack.append(node.right)
            if node.left:
                stack.append(node.left)

    def inorder(self):
        """Recursive in-order traversal: left -> root -> right.

        :returns: generator of nodes
        """
        if not self:
            return

        if self.left:
            yield from self.left.inorder()

        yield self

        if self.right:
            yield from self.right.inorder()

    def inorder_norecur(self):
        """Iterative in-order traversal: left -> root -> right.

        Walk down to the left-most node, pushing every node on the way.
        Then pop a node, yield it, and continue with its right child (which
        is simply skipped when it is ``None``).

        :returns: generator of nodes
        """
        if not self:
            return

        stack = []
        node = self

        while node is not None or stack:
            if node is not None:
                stack.append(node)
                node = node.left
            else:
                # If the popped node has no right child, the next iteration
                # pops its parent; otherwise the right subtree is pushed.
                node = stack.pop()
                yield node
                node = node.right

    def postorder(self):
        """Recursive post-order traversal: left -> right -> root.

        :returns: generator of nodes
        """
        if not self:
            return

        if self.left:
            yield from self.left.postorder()

        if self.right:
            yield from self.right.postorder()

        yield self

    def postorder_norecur(self):
        """Iterative post-order traversal: left -> right -> root.

        Unlike the in-order case, the node on top of the stack may only be
        popped when its right subtree needs no further work, i.e. when

        1. it has no right child at all, or
        2. its right child is the node that was yielded last, which means
           the whole right subtree has already been traversed.

        :returns: generator of nodes
        """
        if not self:
            return

        stack = []
        node = self
        last_node = None

        while node is not None or stack:
            if node is not None:
                stack.append(node)
                node = node.left
                continue

            # ``node`` is None here, so the loop condition guarantees that
            # the stack is non-empty and ``stack[-1]`` cannot fail.
            top = stack[-1]
            # Testing ``right is None`` (rather than "is a leaf") also covers
            # nodes that have only a left child; with a leaf-only test such a
            # node could never be popped and the loop would not terminate.
            if top.right is None or top.right is last_node:
                stack.pop()
                last_node = top
                yield top
                # ``node`` stays None: this node and both subtrees are done,
                # so the next iteration backtracks to the parent.
            else:
                node = top.right

    def breadth_first(self):
        """Breadth-first (level-order) traversal.

        Differs from the depth-first traversals above by using a FIFO queue
        instead of a stack.

        :returns: generator of nodes
        """
        from collections import deque

        if not self:
            return

        queue = deque([self])
        while queue:
            node = queue.popleft()  # take from the front of the queue
            yield node
            if node.left:
                queue.append(node.left)
            if node.right:
                queue.append(node.right)

    def breadth_frist(self):
        """Backward-compatible (misspelled) alias of :meth:`breadth_first`."""
        return self.breadth_first()

    @property
    def children(self):
        """Iterate over the usable children of this node.

        Yields ``(child, index)`` tuples where ``index`` is 0 for the left
        child and 1 for the right child.  A child is skipped when it is
        falsy or when its ``data`` is ``None``.
        """
        if self.left and self.left.data is not None:
            yield self.left, 0
        if self.right and self.right.data is not None:
            yield self.right, 1

    def set_child(self, index, child):
        """Set one of the node's children.

        ``index`` 0 sets the left child; any other value sets the right one.
        """
        if index == 0:
            self.left = child
        else:
            self.right = child

    def height(self):
        """Return the height of the subtree rooted at this node.

        The height counts nodes along the longest downward path, following
        only the children reported by :attr:`children`; a node without such
        children has height ``int(bool(self))``.
        """
        min_height = int(bool(self))
        return max([min_height] + [child.height() + 1 for child, _ in self.children])

    def __repr__(self):
        return '<%s - %s>' % (self.__class__.__name__, repr(self.data))

    # NOTE: ``__bool__`` and ``__eq__`` are intentionally not defined, so
    # nodes are always truthy and compare by identity; ``__hash__`` below is
    # consistent with that identity-based equality.

    def __hash__(self):
        return id(self)
kd_tree.py:KD树
1 # -*- coding: utf-8 -*- 2 3 import numpy as np 4 from collections import deque 5 from scipy.spatial.distance import euclidean 6 from tree import Node 7 8 9 class KDNode(Node): 10 def __init__