- 空间聚类是基于一定的相似性度量对空间大数据集进行分组的过程。空间聚类分析是一种无监督形式的机器学习。空间聚类应用广泛,如地理信息系统、生态环境、军事、市场分析等领域。通过空间聚类可以从空间数据集中发现隐含的信息或知识,包括空间实体聚集趋势,分布规律和发展变化趋势等。聚类方法可以大致分为:基于层次的聚类、基于划分的聚类、基于密度的聚类、基于网格的聚类、基于模型的聚类。
- DBSCAN(Density-Based Spatial Clustering of Applications with Noise)算法是基于密度的聚类方法的一种经典算法,于1996年由M Ester等人[1]提出。DBSCAN由两个参数控制,分别为MinPts和Epsilon。
- ST-DBSCAN(Spatial Temporal-DBSCAN)算法由Derya Birant等人[2]在2007年提出,用于海洋环境研究。ST-DBSCAN是在DBSCAN基础上发展而来,相比DBSCAN多了一个维度上的聚类。需要注意的是,多出的这个维度上的约束条件不一定是时间距离,也可以是与二维空间无相关性的其它维度,例如高程、颜色、温度、质量等。ST-DBSCAN算法如下(截取自[2]原文):
算法示意如下图,左为DBSCAN,右为ST-DBSCAN
以某地共享单车数据前100条为例实现 ST-DBSCAN聚类,ST-DBSCAN参数设置为:
spatial_threshold = 500 # meters
temporal_threshold = 60 # minutes
min_neighbors = 3 # points
结果如下:
running in python3.8:
main.py
import pandas as pd
from sys import argv
import STDBSCAN
import numpy as np
# An optional first command-line argument selects the CSV file; without it the
# bundled test file is used. (argv[0] is the script's own path, not the data.)
csv_path = argv[1] if len(argv) > 1 else 'TestData.csv'

# Cluster only the first 100 records of the data set.
raw = pd.read_csv(csv_path)
sample = raw.head(n=100)

# df_table must have the columns: 'latitude', 'longitude' and 'date_time'.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `raw` (which triggers SettingWithCopyWarning).
# NOTE(review): 'D-lat' is paired with 'O-lng' and 'O-Time' -- confirm that
# this latitude column is the intended one.
df_table = sample[['sn', 'O-Time', 'D-lat', 'O-lng']].copy()
df_table.columns = ['id', 'date_time', 'latitude', 'longitude']
df_table['date_time'] = pd.to_timedelta(df_table['date_time'].astype(str))

spatial_threshold = 500  # meters
temporal_threshold = 60  # minutes
min_neighbors = 3  # points

df_clustering = STDBSCAN.ST_DBSCAN(
    df_table, spatial_threshold, temporal_threshold, min_neighbors)
print(df_clustering)
def plot_clusters(df):
    """Scatter-plot the clustered points, one colour per cluster label.

    Args:
        df: DataFrame with 'longitude', 'latitude' and 'cluster' columns.
            Rows labelled -1 are treated as noise and drawn in black.
    """
    import matplotlib.pyplot as plt

    labels = df['cluster'].values
    X = df[['longitude', 'latitude']].values

    # np.unique returns the labels sorted, so each label gets the same colour
    # on every run (iterating a plain set gives no such guarantee).
    unique_labels = np.unique(labels)
    colors = [plt.cm.Spectral(each)
              for each in np.linspace(0, 1, len(unique_labels))]

    for k, col in zip(unique_labels, colors):
        if k == -1:
            # Black used for noise
            col = [0, 0, 0, 1]
        xy = X[labels == k]
        plt.plot(xy[:, 0], xy[:, 1], 'o', markerfacecolor=tuple(col),
                 markeredgecolor='k', markersize=6)

    # The noise label is not a cluster, so it is left out of the count.
    n_clusters = int(np.count_nonzero(unique_labels != -1))
    plt.title('ST-DBSCAN: #n of clusters {}'.format(n_clusters))
    plt.xlabel('longitude(E)')
    plt.ylabel('latitude(N)')
    plt.show()
# Number of points per cluster label (-1 is noise). Series.value_counts()
# replaces the deprecated top-level pd.value_counts().
print(df_clustering['cluster'].value_counts())
# ST_DBSCAN adds the 'cluster' column to df_table in place, so df_table can be
# plotted directly.
plot_clusters(df_table)
STDBSCAN.py
from datetime import timedelta
from geopy.distance import great_circle
"""
INPUTS:
df={o1,o2,...,on} Set of objects
spatial_threshold = Maximum geographical coordinate (spatial) distance value
temporal_threshold = Maximum non-spatial distance value
min_neighbors = Minimum number of points within Eps1 and Eps2 distance
OUTPUT:
C = {c1,c2,...,ck} Set of clusters
"""
def ST_DBSCAN(df, spatial_threshold, temporal_threshold, min_neighbors):
    """Assign an ST-DBSCAN cluster label to every row of ``df``.

    A 'cluster' column is written into ``df`` in place. Clusters are numbered
    from 1; a row with fewer than ``min_neighbors`` neighbours that is never
    claimed by a core point's neighbourhood is labelled -1 (noise).

    Args:
        df: DataFrame that is passed on to ``retrieve_neighbors`` for the
            neighbourhood queries. Rows are addressed by index label, so the
            index is assumed to be unique -- TODO confirm for other callers.
        spatial_threshold: maximum spatial distance, passed to
            ``retrieve_neighbors``.
        temporal_threshold: maximum temporal distance, passed to
            ``retrieve_neighbors``.
        min_neighbors: minimum neighbourhood size for a point to be a core
            point.

    Returns:
        The same ``df`` object, now carrying the 'cluster' column.
    """
    NOISE = -1
    UNMARKED = 777777  # placeholder label for rows not visited yet
    cluster_label = 0

    # initialize each point with unmarked
    df['cluster'] = UNMARKED

    for index in df.index:
        if df.at[index, 'cluster'] != UNMARKED:
            continue  # already labelled while an earlier cluster was grown

        neighborhood = retrieve_neighbors(
            index, df, spatial_threshold, temporal_threshold)
        if len(neighborhood) < min_neighbors:
            df.at[index, 'cluster'] = NOISE
            continue

        # Found a core point: open a new cluster and seed it with the core
        # point's whole neighbourhood, whatever those rows were labelled before.
        cluster_label += 1
        df.at[index, 'cluster'] = cluster_label
        for neig_index in neighborhood:
            df.at[neig_index, 'cluster'] = cluster_label
        stack = list(neighborhood)

        # Grow the cluster: every member that is itself a core point pulls in
        # those of its neighbours that are still unmarked.
        while stack:
            current_point_index = stack.pop()
            new_neighborhood = retrieve_neighbors(
                current_point_index, df, spatial_threshold, temporal_threshold)
            if len(new_neighborhood) < min_neighbors:
                continue  # not a core point, so it does not expand the cluster
            for neig_index in new_neighborhood:
                # Only unmarked rows are added here; rows already marked as
                # noise or belonging to a cluster keep their label.
                if df.at[neig_index, 'cluster'] == UNMARKED:
                    # TODO: verify cluster average before add new point
                    df.at[neig_index, 'cluster'] = cluster_label
                    stack.append(neig_index)
    return df
def retrieve_neighbors(index_center, df, spatial_threshold, temporal_threshold):
    """Return the index labels of the rows close to ``index_center``.

    A row is a neighbour when its 'date_time' lies within
    ``temporal_threshold`` minutes of the centre row (bounds included) and its
    great-circle distance to the centre is at most ``spatial_threshold``
    metres. The centre row itself is never returned.

    Args:
        index_center: index label of the centre row in ``df``.
        df: DataFrame with 'date_time', 'latitude' and 'longitude' columns.
        spatial_threshold: maximum distance in metres.
        temporal_threshold: maximum time difference in minutes.

    Returns:
        List of index labels, in the row order of ``df``.
    """
    center_point = df.loc[index_center]
    center_coords = (center_point['latitude'], center_point['longitude'])

    # filter by time
    window = timedelta(minutes=temporal_threshold)
    min_time = center_point['date_time'] - window
    max_time = center_point['date_time'] + window
    candidates = df[(df['date_time'] >= min_time) & (df['date_time'] <= max_time)]

    # filter by distance; reading the columns directly avoids building a
    # Series object for every row as iterrows() does
    return [
        index
        for index, latitude, longitude in zip(
            candidates.index, candidates['latitude'], candidates['longitude'])
        if index != index_center
        and great_circle(center_coords, (latitude, longitude)).meters <= spatial_threshold
    ]
- *测试数据
参考文献
[1] Ester M, Kriegel H P, Sander J, et al. A density-based algorithm for discovering clusters in large spatial databases with noise[C]//Kdd. 1996, 96(34): 226-231.
[2] Birant D, Kut A. ST-DBSCAN: An algorithm for clustering spatial–temporal data[J]. Data & knowledge engineering, 2007, 60(1): 208-221.