根据经纬度之间的距离将站点分组:利用 sklearn.cluster.DBSCAN 密度聚类算法实现快速分组。
假设已有一份 Excel 数据(工作表"点位分组"),其中包含数值型且无空值的"经度"和"纬度"两列,示例代码如下:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.cluster import DBSCAN
from math import *
# Custom distance function (used as the DBSCAN metric).
def calculate_distance(lon1, lat1, lon2, lat2):
    """Return the great-circle distance between two longitude/latitude points.

    Uses the haversine formula on a sphere of radius 6371229; the result is in
    the same unit as that constant (presumably metres -- confirm).

    Args:
        lon1: Longitude of the first point, in degrees.
        lat1: Latitude of the first point, in degrees.
        lon2: Longitude of the second point, in degrees.
        lat2: Latitude of the second point, in degrees.
            Every argument is passed through ``float()``, so numbers and
            numeric strings are both accepted.

    Returns:
        The distance as a float rounded to 4 decimal places.

    Raises:
        ValueError, TypeError: If an argument cannot be converted by
            ``float()``.
    """
    lon1, lat1, lon2, lat2 = map(
        radians, [float(lon1), float(lat1), float(lon2), float(lat2)]
    )
    d_lon, d_lat = lon2 - lon1, lat2 - lat1
    am = sin(d_lat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(d_lon / 2) ** 2
    # Floating-point rounding can push the haversine term a hair above 1 for
    # (near-)antipodal points, which would make asin() raise a domain error.
    am = min(1.0, am)
    distance = 2 * asin(sqrt(am)) * 6371229
    # Round to 4 decimal places via string formatting.
    return float('%.4f' % distance)
def run_dbscan(path=None, eps=None, min_samples=None,
               path_out=r'D:\xxx经纬度信息表-结果-分组.xlsx'):
    """Group the sites of an Excel sheet by geographic distance using DBSCAN.

    Reads the sheet '点位分组', checks that the '经度' (longitude) and '纬度'
    (latitude) columns are numeric and contain no missing values, clusters
    the rows with ``calculate_distance`` as the pairwise metric, and writes
    the input rows plus a 'cluster' label column to ``path_out``.

    Args:
        path: Input workbook. Defaults to the module-level ``path_in``.
        eps: DBSCAN neighbourhood radius, expressed in the unit returned by
            ``calculate_distance``. Defaults to the module-level ``eps_``.
        min_samples: DBSCAN ``min_samples`` value. Defaults to the
            module-level ``min_n``.
        path_out: Workbook the labelled result is written to.

    Returns:
        The labelled DataFrame, or ``None`` when validation fails (an error
        message is printed in that case).
    """
    # Fall back to the module-level settings so that existing no-argument
    # ``run_dbscan()`` calls keep working unchanged.
    path = path_in if path is None else path
    eps = eps_ if eps is None else eps
    min_samples = min_n if min_samples is None else min_samples

    df = pd.read_excel(path, sheet_name='点位分组')
    lon, lat = df['经度'], df['纬度']

    # Both coordinate columns must be numeric ...
    if not (is_numeric_dtype(lon) and is_numeric_dtype(lat)):
        print('ERROR:数据有异常')
        return None
    # ... and free of missing values.
    if lon.isnull().any() or lat.isnull().any():
        print('ERROR:数据有空值')
        return None

    # Column order (longitude, latitude) matches the argument order that the
    # metric lambda unpacks into calculate_distance(lon1, lat1, lon2, lat2).
    coords = df[['经度', '纬度']]
    # The metric is a custom Python callable, so ``eps`` is compared against
    # calculate_distance() results rather than a built-in Minkowski distance.
    db = DBSCAN(
        eps=eps,
        min_samples=min_samples,
        metric=lambda s1, s2: calculate_distance(*s1, *s2),
    ).fit(coords)

    # Attach the labels to a copy of the input instead of merging a subset of
    # df back onto df, which duplicated the coordinate columns with _x/_y
    # suffixes and assigned into a slice of the original frame.
    result = df.copy()
    result['cluster'] = db.labels_
    result.to_excel(path_out)
    print(result)
    return result
if __name__ == '__main__':
    # run_dbscan() reads these three settings as module-level globals:
    # the input workbook, the neighbourhood radius and the sample count.
    path_in, eps_, min_n = r'D:\经纬度信息表.xlsx', 100, 1
    run_dbscan()