算法介绍参照此文章
核心思路:
- 首先任意选取一个点,然后找到到这个点距离小于等于 eps 的所有的点。如果距起始点的距离在 eps 之内的数据点个数小于 min_samples,那么这个点被标记为噪声。如果距离在 eps 之内的数据点个数大于等于 min_samples,则这个点被标记为核心样本,并被分配一个新的簇标签。
- 然后访问该点的所有邻居(在距离 eps 以内)。如果它们还没有被分配一个簇,那么就将刚刚创建的新的簇标签分配给它们。如果它们是核心样本,那么就依次访问其邻居,以此类推。簇逐渐增大,直到在簇的 eps 距离内没有更多的核心样本为止。
- 选取另一个尚未被访问过的点,并重复相同的过程。
算法实现
"""
AUTHOR: chenyi
DATE: 2021-11-13
Density-Based Spatial Clustering of Applications with Noise
In order to visualize the result in 2D,
let's assume that every point has exactly two dimensions.
Core Params:
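- eps: the neighbourhood radius; two points are neighbours when their distance is less than eps.
- minPoints: the minimum number of neighbours (the point itself excluded) a point needs within eps to count as a core point.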
Point {
visited -> bool: indicate whether a point is visited.
coords -> (float, float): describe a point's coordination.
cluster -> int: indicate to which cluster the point belongs.
}
"""
import random
from math import sqrt
import matplotlib.pyplot as plt
import numpy as np
class Point:
    """A 2-D sample together with its clustering state."""

    def __init__(self, coords=None):
        """Create an unvisited, unlabelled point.

        Args:
            coords: ``(x, y)`` pair to use for the point. When omitted, each
                coordinate is drawn as ``random.random() * 10``.
        """
        if coords is None:
            coords = (random.random() * 10, random.random() * 10)
        # Flipped to True by set_cluster(); nothing else in this class sets it.
        self.visited = False
        self.coords = coords
        # Label given through set_cluster(); None until the point is labelled.
        self.cluster = None

    def cal_distance(self, other):
        """Return the Euclidean distance between this point and ``other``.

        Only the first two entries of each ``coords`` are used.
        """
        dx = other.coords[0] - self.coords[0]
        dy = other.coords[1] - self.coords[1]
        return sqrt(dx ** 2 + dy ** 2)

    def set_cluster(self, id):
        """Store ``id`` as the point's cluster label and mark it visited.

        The parameter keeps its original name ``id`` (which shadows the
        builtin inside this method) so keyword callers continue to work.
        """
        self.visited = True
        self.cluster = id

    def to_string(self):
        """Return a tab-separated, human-readable summary of the point."""
        return "cluster:{}\tvisited: {}\tcoords:{}".format(self.cluster, self.visited, self.coords)
class DBSCAN:
    """Density-based clustering over a list of points.

    Two points are neighbours when their distance is strictly less than
    ``eps``. A point is a core point when it has at least ``minPoints``
    neighbours, not counting itself.
    """

    # Label used for noise; draw() renders points carrying it as red crosses.
    NOISE = -1

    def __init__(self, n, eps, minPoints):
        """Generate ``n`` random points and store the clustering parameters.

        Args:
            n: number of points to create with ``Point()``.
            eps: neighbourhood radius.
            minPoints: neighbour count required for a point to be core.
        """
        self.points = [Point() for _ in range(n)]
        self.eps = eps
        self.minPoints = minPoints
        # Label handed out by the next DFS() call; the caller increments it
        # once a cluster has been fully expanded.
        self.cluster = 0

    def is_core(self, point: "Point"):
        """Return True when ``point`` has at least ``minPoints`` neighbours."""
        return len(self.find_all_adjacent_points(point)) >= self.minPoints

    def pick_unvisited_point(self):
        """Return a randomly chosen point whose ``visited`` flag is False.

        Raises:
            IndexError: from ``random.choice`` when every point is visited.
        """
        unvisited = [p for p in self.points if not p.visited]
        return random.choice(unvisited)

    def is_all_visited(self):
        """Return True when every point has its ``visited`` flag set."""
        return all(p.visited for p in self.points)

    def find_all_adjacent_points(self, point: "Point"):
        """Return every other point closer than ``eps`` to ``point``.

        ``point`` itself is excluded by identity, so it does not have to be
        a member of ``self.points``.
        """
        return [
            other
            for other in self.points
            if other is not point and point.cal_distance(other) < self.eps
        ]

    def DFS(self, core: "Point"):
        """Label ``core`` and everything density-reachable from it.

        ``core`` is expanded without re-checking that it is a core point;
        the caller is expected to have tested it with ``is_core``. Every
        point reached is given the label ``self.cluster``.

        The traversal uses an explicit stack rather than recursion so that a
        long chain of core points cannot exceed the interpreter's recursion
        limit.
        """
        # The seed belongs to the cluster it starts. Without this it would
        # stay unvisited whenever none of its neighbours is itself core.
        if not core.visited or core.cluster == self.NOISE:
            core.set_cluster(self.cluster)

        stack = [core]
        while stack:
            current = stack.pop()
            for neighbour in self.find_all_adjacent_points(current):
                if neighbour.visited:
                    # A point labelled noise earlier turns out to be a border
                    # point of this cluster. It is relabelled but not
                    # expanded; points that already carry a cluster label
                    # are left untouched.
                    if neighbour.cluster == self.NOISE:
                        neighbour.set_cluster(self.cluster)
                    continue
                neighbour.set_cluster(self.cluster)
                if self.is_core(neighbour):
                    stack.append(neighbour)

    def print_result(self):
        """Print ``to_string()`` of every point, one per line."""
        for point in self.points:
            print(point.to_string())

    def draw(self):
        """Plot each cluster, its eps circles, and the noise points."""
        fig = plt.figure()
        ax = fig.add_subplot(111)
        plt.xlim(0, 10)
        plt.ylim(0, 10)
        for label in range(self.cluster):
            members = [p.coords for p in self.points if p.cluster == label]
            xs = [c[0] for c in members]
            ys = [c[1] for c in members]
            plt.scatter(xs, ys)
            # One unfilled circle of radius eps around each cluster member.
            for x, y in zip(xs, ys):
                ax.add_patch(plt.Circle((x, y), self.eps, fill=False))
        # Noise points are drawn last, as red crosses.
        noise = [p.coords for p in self.points if p.cluster == self.NOISE]
        plt.scatter([c[0] for c in noise], [c[1] for c in noise], c='r', marker='x')
        plt.show()
def main(n=10, eps=2, min_points=1):
    """Cluster ``n`` random points, print the labels and plot the result.

    Args:
        n: number of random points to generate.
        eps: neighbourhood radius passed to ``DBSCAN``.
        min_points: neighbour count required for a core point.
    """
    model = DBSCAN(n, eps, min_points)
    while not model.is_all_visited():
        point = model.pick_unvisited_point()
        if model.is_core(point):
            # Label the seed before expanding it, so it is marked visited
            # even when none of its neighbours is a core point; otherwise
            # this loop could pick the same seed forever.
            point.set_cluster(model.cluster)
            model.DFS(point)
            model.cluster += 1
        else:
            point.set_cluster(-1)  # -1 means the point belongs to noise.
    model.print_result()
    model.draw()


if __name__ == '__main__':
    main()
N=10 eps=2 minDots=1的运行结果
N=20 dim =2 num=2的运行结果