做用户聚类,有一个feature涉及到面积计算,温故一下之前了解到的凸包算法的流程,实际上,这篇文章基于的原文章,达到的精度,满足我的需求。具体实现上代码参考,因为坐标系不是笛卡尔坐标系
先上笔者参考已有的实现稍作修改后作为静态工具类使用的代码
# the class that seals the algorithm that uses convex hull to calculate the points-distributed-area
from math import *
class cal_area(object):
@staticmethod
def deg2rad(deg):
return deg * (pi / 180)
@staticmethod
def cross(A, B):
return A[0] * B[1] - A[1] * B[0]
@staticmethod
def vectorMinus(a, b):
return ((a[0] - b[0]) * 1000, (a[1] - b[1]) * 1000)
@staticmethod
def getLTDis(A, B):
lon1, lat1, lon2, lat2 = map(cal_area.deg2rad, [A[0], A[1], B[0], B[1]])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
c = 2 * asin(sqrt(a))
r = 6371.393
# print A,B
return c * r * 1.0 # 1000.0
@staticmethod
def triangleAre(A, B, C):
x, y, z = cal_area.getLTDis(A, B), cal_area.getLTDis(B, C), cal_area.getLTDis(C, A)
c = (x + y + z) / 2
return sqrt((c) * (c - y) * (c - z) * (c - x))
@staticmethod
def grahamScanArea(data):
data.sort(key=lambda x: (x[0], x[1]), reverse=False)
ans = [0] * (len(data) * 2)
m = 0
for item in data:
top = len(item)
while (m > 1 and cal_area.cross(cal_area.vectorMinus(ans[m - 1], ans[m - 2]),
cal_area.vectorMinus(item, ans[m - 2])) <= 0): m = m - 1
ans[m] = item
m = m + 1
k = m
flag = True
data.reverse()
for item in data:
if flag:
flag = False
continue
while (m > k and cal_area.cross(cal_area.vectorMinus(ans[m - 1], ans[m - 2]),
cal_area.vectorMinus(item, ans[m - 2])) <= 0): m = m - 1
ans[m] = item
m = m + 1
m = m - 1
b = [ans[i] for i in range(0, m)]
if len(b) < 3: return 0
# if DEBUG : print b
return cal_area.AREA(b)
@staticmethod
def AREA(b):
ans = 0.0
for i in range(len(b)):
if i == 0 or i + 1 >= len(b): continue
x, y = b[i], b[i + 1]
ans += cal_area.triangleAre(b[0], x, y)
return ans
## 使用方式
# calculate coverage of the business that customer has been to
# form the user_points table
user_visited_area=[]
for user_id in user_business.keys():
points=[]
visited_business=user_business[user_id]
for business_id in visited_business:
business_gps=business[business_id]
if business_gps['longitude']==None or business_gps['latitude']==None: continue
points.append([float(business_gps['longitude']),float(business_gps['latitude'])])
visited_area=cal_area.grahamScanArea(points)
user_visited_area.append(visited_area)
对于一个点集P来讲,它的凸包就是一个凸多边形Q,其中满足P中的每个点都在Q的边界上或内部。就像下图所示
凸包的计算算法有好多种,wiki和算法导论第33章中都有比较详细的介绍,比如下面是算法导论中给出的Graham-Scan算法计算凸包的伪代码。
现在网上已经有了好多计算点集凸包的优秀代码,比如这篇文章,作者在文中使用了一个动画来表示了Graham-Scan算法计算凸包的过程,并给出了python程序的实现,十分有助于学习者对算法的理解。最近有个东西需要使用到凸包,本着“不要重复造轮子”的原则,我在开始的时候直接使用了作者文中的程序。开始使用的时候没有发现什么问题,比如下图所示的效果。
但是,当我在使用其他的数据进行测试的时候,发现程序执行的效果并不太好,下图是使用著名的Iris数据集的前两个维度测试的效果,这明显不是一个凸包。
没有找到原因,于是开始自己写程序,发现结果也是这样的。后来将执行过程中栈的状态打印输出,才发现我们两个的程序中都存在一个小问题。当点集中出现两个完全相同的点的时候,算法就是失效了。上面伪代码中序号8所示的while循环的判断条件不应该是两个向量的叉乘大于0, 而应该是大于等于0。否则的话,对于点集中出现连续两个完全相同的点的时候,栈只能弹出其中一个点,而另一个点会仍然在栈中。所以会出现上图所示的结果。修改之后的结果如下图所示,这才是一个正确的凸包。
具体的Python实现代码如下
import matplotlib.pyplot as plt
import math
import sklearn.datasets as datasets
"""
使用Graham扫描法计算凸包
网上的代码好多运行效果并不好
算法参见《算法导论》第三版 第605页
"""
def get_bottom_point(points):
"""
返回points中纵坐标最小的点的索引,如果有多个纵坐标最小的点则返回其中横坐标最小的那个
:param points:
:return:
"""
min_index = 0
n = len(points)
for i in range(0, n):
if points[i][1] < points[min_index][1] or (points[i][1] == points[min_index][1] and points[i][0] < points[min_index][0]):
min_index = i
return min_index
def sort_polar_angle_cos(points, center_point):
"""
按照与中心点的极角进行排序,使用的是余弦的方法
:param points: 需要排序的点
:param center_point: 中心点
:return:
"""
n = len(points)
cos_value = []
rank = []
norm_list = []
for i in range(0, n):
point_ = points[i]
point = [point_[0]-center_point[0], point_[1]-center_point[1]]
rank.append(i)
norm_value = math.sqrt(point[0]*point[0] + point[1]*point[1])
norm_list.append(norm_value)
if norm_value == 0:
cos_value.append(1)
else:
cos_value.append(point[0] / norm_value)
for i in range(0, n-1):
index = i + 1
while index > 0:
if cos_value[index] > cos_value[index-1] or (cos_value[index] == cos_value[index-1] and norm_list[index] > norm_list[index-1]):
temp = cos_value[index]
temp_rank = rank[index]
temp_norm = norm_list[index]
cos_value[index] = cos_value[index-1]
rank[index] = rank[index-1]
norm_list[index] = norm_list[index-1]
cos_value[index-1] = temp
rank[index-1] = temp_rank
norm_list[index-1] = temp_norm
index = index-1
else:
break
sorted_points = []
for i in rank:
sorted_points.append(points[i])
return sorted_points
def vector_angle(vector):
"""
返回一个向量与向量 [1, 0]之间的夹角, 这个夹角是指从[1, 0]沿逆时针方向旋转多少度能到达这个向量
:param vector:
:return:
"""
norm_ = math.sqrt(vector[0]*vector[0] + vector[1]*vector[1])
if norm_ == 0:
return 0
angle = math.acos(vector[0]/norm_)
if vector[1] >= 0:
return angle
else:
return 2*math.pi - angle
def coss_multi(v1, v2):
"""
计算两个向量的叉乘
:param v1:
:param v2:
:return:
"""
return v1[0]*v2[1] - v1[1]*v2[0]
def graham_scan(points):
# print("Graham扫描法计算凸包")
bottom_index = get_bottom_point(points)
bottom_point = points.pop(bottom_index)
sorted_points = sort_polar_angle_cos(points, bottom_point)
m = len(sorted_points)
if m < 2:
print("点的数量过少,无法构成凸包")
return
stack = []
stack.append(bottom_point)
stack.append(sorted_points[0])
stack.append(sorted_points[1])
for i in range(2, m):
length = len(stack)
top = stack[length-1]
next_top = stack[length-2]
v1 = [sorted_points[i][0]-next_top[0], sorted_points[i][1]-next_top[1]]
v2 = [top[0]-next_top[0], top[1]-next_top[1]]
while coss_multi(v1, v2) >= 0:
stack.pop()
length = len(stack)
top = stack[length-1]
next_top = stack[length-2]
v1 = [sorted_points[i][0] - next_top[0], sorted_points[i][1] - next_top[1]]
v2 = [top[0] - next_top[0], top[1] - next_top[1]]
stack.append(sorted_points[i])
return stack
def test1():
points = [[1.1, 3.6],
[2.1, 5.4],
[2.5, 1.8],
[3.3, 3.98],
[4.8, 6.2],
[4.3, 4.1],
[4.2, 2.4],
[5.9, 3.5],
[6.2, 5.3],
[6.1, 2.56],
[7.4, 3.7],
[7.1, 4.3],
[7, 4.1]]
for point in points:
plt.scatter(point[0], point[1], marker='o', c='y')
result = graham_scan(points)
length = len(result)
for i in range(0, length-1):
plt.plot([result[i][0], result[i+1][0]], [result[i][1], result[i+1][1]], c='r')
plt.plot([result[0][0], result[length-1][0]], [result[0][1], result[length-1][1]], c='r')
plt.show()
def test2():
"""
使用复杂一些的数据测试程序运行效果
:return:
"""
iris = datasets.load_iris()
data = iris.data
points_ = data[:, 0:2]
points__ = points_[0:50, :]
points = points__.tolist()
temp_index = 0
for point in points:
plt.scatter(point[0], point[1], marker='o', c='y')
index_str = str(temp_index)
plt.annotate(index_str, (point[0], point[1]))
temp_index = temp_index + 1
result = graham_scan(points)
print(result)
length = len(result)
for i in range(0, length-1):
plt.plot([result[i][0], result[i+1][0]], [result[i][1], result[i+1][1]], c='r')
plt.plot([result[0][0], result[length-1][0]], [result[0][1], result[length-1][1]], c='r')
# for i in range(0, len(rank)):
plt.show()
if __name__ == "__main__":
test2()
经纬度坐标下利用凸包,近似计算面积的代码参考
def cross(A,B):
return A[0] * B[1] - A[1] * B[0]
def vectorMinus( a , b):
return ( (a[0] - b[0] )*1000,(a[1] - b[1] )*1000)
def getLTDis( A, B ):
lon1, lat1, lon2, lat2 = map(radians, [A[0], A[1], B[0], B[1]])
dlon = lon2 - lon1
dlat = lat2 - lat1
a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
c = 2 * asin(sqrt(a))
r = 6371.393
#print A,B
return c * r * 1000.0
def triangleAre(A,B,C):
x,y,z = getLTDis(A,B),getLTDis(B,C),getLTDis(C,A)
c = (x + y + z) /2
return sqrt((c)*(c-y)*(c-z)*(c-x))
def grahamScanArea(data):
data.sort(key=lambda x:(x[0],x[1]),reverse=False)
ans = [ 0 ] * (len(data)*2)
m = 0
for item in data:
top = len(item)
while( m > 1 and cross( vectorMinus(ans[ m -1 ] , ans [ m - 2 ]), vectorMinus( item , ans [ m - 2 ] )) <= 0 ) : m = m -1
ans[m] = item
m = m + 1
k = m
flag = True
data.reverse()
for item in data:
if flag :
flag = False
continue
while( m > k and cross( vectorMinus(ans[ m -1 ] , ans [ m - 2 ]), vectorMinus( item , ans [ m - 2 ] )) <= 0) : m = m - 1
ans [m] = item
m = m + 1
m = m -1
b = [ ans[i] for i in range(0, m)]
if len(b) < 3 : return 0
#if DEBUG : print b
return AREA(b)
def AREA(b):
ans = 0.0
for i in range(len(b)):
if i == 0 or i + 1 >= len(b) : continue
x , y = b[i] , b[i + 1]
ans += triangleAre( b[0] , x , y )
return ans