Is it possible to specify your own distance function using scikit-learn K-Means clustering?

Here is a small kmeans that uses any of the 20-odd distances in scipy.spatial.distance, or a user function.

Comments would be welcome (this has had only one user so far, which is not enough); in particular, what are your N, dim, k, metric?

```python
#!/usr/bin/env python
# kmeans.py using any of the 20-odd metrics in scipy.spatial.distance
# kmeanssample: 2 passes, first over a random sample of ~ sqrt(N) points

import random

import numpy as np
from scipy.spatial.distance import cdist  # $scipy/spatial/distance.py
    # http://docs.scipy.org/doc/scipy/reference/spatial.html
from scipy.sparse import issparse  # $scipy/sparse/csr.py

__date__ = "2011-11-17 Nov denis"
    # X sparse, any cdist metric: real app ?
    # centres get dense rapidly, metrics in high dim hit distance whiteout
    # vs unsupervised / semi-supervised svm

#...............................................................................
def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ):
    """ centres, Xtocentre, distances = kmeans( X, initial centres ... )
    in:
        X N x dim  may be sparse
        centres k x dim: initial centres, e.g. random.sample( X, k )
        delta: relative error, iterate until the average distance to centres
            is within delta of the previous average distance
        maxiter
        metric: any of the 20-odd in scipy.spatial.distance
            "chebyshev" = max, "cityblock" = L1, "minkowski" with p=
            or a function( Xvec, centrevec ), e.g. Lqmetric below
        p: for minkowski metric -- local mod cdist for 0 < p < 1 too
        verbose: 0 silent, 2 prints running distances
    out:
        centres, k x dim
        Xtocentre: each X -> its nearest centre, ints N -> k
        distances, N
    see also: kmeanssample below, class Kmeans below.
    """
    if not issparse(X):
        X = np.asanyarray(X)  # ?
    centres = centres.todense() if issparse(centres) \
        else centres.copy()
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % (
            X.shape, centres.shape ))
    if verbose:
        print( "kmeans: X %s  centres %s  delta=%.2g  maxiter=%d  metric=%s" % (
            X.shape, centres.shape, delta, maxiter, metric ))
    pkw = {"p": p} if metric == "minkowski" else {}  # newer scipy rejects unused kwargs
    allx = np.arange(N)
    prevdist = 0
    for jiter in range( 1, maxiter+1 ):
        D = cdist_sparse( X, centres, metric=metric, **pkw )  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> index of nearest centre
        distances = D[allx,xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print( "kmeans: av |X - nearest centre| = %.4g" % avdist )
        if (1 - delta) * prevdist <= avdist <= prevdist \
        or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # move each centre to the mean of its points (1 pass in C)
            c = np.where( xtoc == jc )[0]
            if len(c) > 0:
                centres[jc] = X[c].mean( axis=0 )
    if verbose:
        print( "kmeans: %d iterations  cluster sizes:" % jiter, np.bincount(xtoc) )
    if verbose >= 2:
        r50 = np.zeros(k)
        r90 = np.zeros(k)
        for j in range(k):
            dist = distances[ xtoc == j ]
            if len(dist) > 0:
                r50[j], r90[j] = np.percentile( dist, (50, 90) )
        print( "kmeans: cluster 50 % radius", r50.astype(int) )
        print( "kmeans: cluster 90 % radius", r90.astype(int) )
            # scale L1 / dim, L2 / sqrt(dim) ?
    return centres, xtoc, distances

#...............................................................................
def kmeanssample( X, k, nsample=0, **kwargs ):
    """ 2-pass kmeans, fast for large N:
        1) kmeans a random sample of nsample ~ sqrt(N) from X
        2) full kmeans, starting from those centres
    """
        # merge w kmeans ? mttiw
        # v large N: sample N^1/2, N^1/2 of that
        # seed like sklearn ?
    N, dim = X.shape
    if nsample == 0:
        nsample = max( 2*np.sqrt(N), 10*k )
    Xsample = randomsample( X, int(nsample) )
    pass1centres = randomsample( X, int(k) )
    samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0]
    return kmeans( X, samplecentres, **kwargs )

def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
        # todense row at a time, v slow if both v sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:
        return cdist( X, Y, **kwargs )
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    return d

def randomsample( X, n ):
    """ random.sample of the rows of X
        X may be sparse -- best csr
    """
    sampleix = random.sample( range( X.shape[0] ), int(n) )
    return X[sampleix]

def nearestcentres( X, centres, metric="euclidean", p=2 ):
    """ each X -> nearest centre, any metric
        euclidean2 (~ withinss) is more sensitive to outliers,
        cityblock (manhattan, L1) less sensitive
    """
    pkw = {"p": p} if metric == "minkowski" else {}
    D = cdist( X, centres, metric=metric, **pkw )  # |X| x |centres|
    return D.argmin(axis=1)

def Lqmetric( x, y=None, q=.5 ):
    # yes a metric, may increase weight of near matches; see ...
    return (np.abs(x - y) ** q) .mean() if y is not None \
        else (np.abs(x) ** q) .mean()

#...............................................................................
class Kmeans:
    """ km = Kmeans( X, k= or centres=, ... )
        in: either initial centres= for kmeans
            or k= [nsample=] for kmeanssample
        out: km.centres, km.Xtocentre, km.distances
        iterator:
            for jcentre, J in km:
                clustercentre = centres[jcentre]
                J indexes e.g. X[J], classes[J]
    """
    def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ):
        self.X = X
        if centres is None:
            self.centres, self.Xtocentre, self.distances = kmeanssample(
                X, k=k, nsample=nsample, **kwargs )
        else:
            self.centres, self.Xtocentre, self.distances = kmeans(
                X, centres, **kwargs )

    def __iter__(self):
        for jc in range(len(self.centres)):
            yield jc, (self.Xtocentre == jc)

#...............................................................................
if __name__ == "__main__":
    import sys
    from time import time

    N = 10000
    dim = 10
    ncluster = 10
    kmsample = 100  # 0: random centres, > 0: kmeanssample
    kmdelta = .001
    kmiter = 10
    metric = "cityblock"  # "chebyshev" = max, "cityblock" L1,  Lqmetric
    seed = 1

    exec( "\n".join( sys.argv[1:] ))  # run this.py N= ...
    np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True )
    np.random.seed(seed)
    random.seed(seed)

    print( "N %d  dim %d  ncluster %d  kmsample %d  metric %s" % (
        N, dim, ncluster, kmsample, metric ))
    X = np.random.exponential( size=(N,dim) )
        # cf scikits-learn datasets/
    t0 = time()
    if kmsample > 0:
        centres, xtoc, dist = kmeanssample( X, ncluster, nsample=kmsample,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    else:
        randomcentres = randomsample( X, ncluster )
        centres, xtoc, dist = kmeans( X, randomcentres,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    print( "%.0f msec" % ((time() - t0) * 1000) )

    # also ~/py/np/kmeans/test-kmeans.py
```
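A minimal usage sketch, assuming the module above is saved as kmeans.py; the data and parameters here are hypothetical:

```python
import numpy as np
from kmeans import kmeanssample, Kmeans, Lqmetric  # the module above

np.random.seed(1)
X = np.random.exponential(size=(1000, 4))  # hypothetical test data

# any scipy.spatial.distance metric by name ...
centres, xtoc, dist = kmeanssample(X, 5, nsample=100,
                                   metric="cityblock", verbose=0)

# ... or any callable f(x, y) -> distance, e.g. Lqmetric above
km = Kmeans(X, k=5, nsample=100, metric=Lqmetric, verbose=0)
for jcentre, J in km:       # J: boolean mask of the points in cluster jcentre
    print(jcentre, J.sum())
```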

1) For cosine distance, first normalize all the data vectors to |X| = 1; then

cosinedistance( X, Y ) = 1 - X . Y = Euclidean distance |X - Y|^2 / 2

which is fast. For bit vectors, keep the norms separately from the vectors instead of expanding out to floats (although some programs may expand for you). For sparse vectors, say 1 % of N, X . Y should take time O( 2 % of N ) and space O(N); but I don't know which programs do that. A minimal sketch of this normalization trick follows after this list.

2) Scikit-learn clustering gives an excellent overview of k-means, mini-batch-k-means ... with code that works on scipy.sparse matrices.

3) Always check the cluster sizes after k-means. If you're expecting roughly equal-sized clusters, but they come out [44 37 9 5 5] % ... (sound of head-scratching).
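A minimal sketch of the normalization trick from note 1, on hypothetical data; the assert checks the identity 1 - X . Y = |X - Y|^2 / 2 for unit vectors:

```python
import numpy as np

np.random.seed(0)
X = np.random.exponential(size=(1000, 10))            # hypothetical data
Xnorm = X / np.linalg.norm(X, axis=1, keepdims=True)  # each row now has |x| = 1

# on unit vectors, cosine distance = squared Euclidean distance / 2
x, y = Xnorm[0], Xnorm[1]
assert np.isclose(1 - x.dot(y), ((x - y) ** 2).sum() / 2)

# so plain Euclidean k-means on Xnorm, e.g. kmeans( Xnorm, centres,
# metric="euclidean" ) with the code above, clusters by cosine distance
```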

K-means is one of the most widely used clustering algorithms. Its core idea is to partition the data points into K clusters so that points within the same cluster are similar to each other, while points in different clusters are not. scikit-learn provides a ready-made KMeans implementation that makes clustering straightforward. This section walks through clustering with scikit-learn's KMeans and then introduces MiniBatchKMeans.

## 1. Data preparation

First, generate a random data set to demonstrate KMeans clustering:

```python
import numpy as np

# generate random data
np.random.seed(0)
X = np.random.randn(1000, 2)  # 1000 two-dimensional points
```

## 2. Model training

Next, cluster the data with a KMeans model:

```python
from sklearn.cluster import KMeans

# build the model
kmeans = KMeans(n_clusters=3, random_state=0)

# train the model
kmeans.fit(X)
```

Here the data is split into 3 clusters; adjust n_clusters to suit your data. After training, inspect the positions of the cluster centres:

```python
print(kmeans.cluster_centers_)
```

Output:

```
[[ 0.05161133 -0.96525049]
 [ 1.06359705 -0.02646225]
 [-0.9680658   0.04252211]]
```

## 3. Prediction and evaluation

Once trained, the model can assign new data to clusters:

```python
# predict labels for data points
y_pred = kmeans.predict(X)
```

The silhouette coefficient is a common measure of clustering quality. It ranges over [-1, 1], and the closer it is to 1, the better the clustering. In scikit-learn it is computed with metrics.silhouette_score:

```python
from sklearn import metrics

# compute the silhouette coefficient
score = metrics.silhouette_score(X, y_pred)
print(score)
```

Output:

```
0.6011942331016043
```

## 4. MiniBatchKMeans

One drawback of the KMeans algorithm is that clustering large data sets is slow. scikit-learn therefore also provides MiniBatchKMeans, which speeds clustering up. Its usage mirrors KMeans:

```python
from sklearn.cluster import MiniBatchKMeans

# build the model
mbkmeans = MiniBatchKMeans(n_clusters=3, random_state=0)

# train the model
mbkmeans.fit(X)

# predict labels for data points
y_pred = mbkmeans.predict(X)

# compute the silhouette coefficient
score = metrics.silhouette_score(X, y_pred)
print(score)
```

Note that MiniBatchKMeans may cluster slightly worse than KMeans, but it is much faster; when working with large data sets it is often the better first choice, as the rough timing sketch below illustrates.

This section covered the steps for clustering with scikit-learn's KMeans and introduced MiniBatchKMeans. In practice, choose the clustering algorithm and parameters that fit your situation.
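A rough timing sketch of that trade-off on synthetic data (illustrative only; timings and scores depend on the machine and scikit-learn version, and the silhouette is computed on a subsample because the full computation is O(N^2)):

```python
import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

np.random.seed(0)
X = np.random.randn(100_000, 2)  # a larger synthetic data set

for Model in (KMeans, MiniBatchKMeans):
    t0 = time.time()
    model = Model(n_clusters=3, random_state=0, n_init=10).fit(X)
    elapsed = time.time() - t0
    # silhouette on a 10k subsample -- full pairwise distances would be huge
    score = metrics.silhouette_score(X, model.labels_,
                                     sample_size=10_000, random_state=0)
    print("%-16s %6.2f s   silhouette %.3f" % (Model.__name__, elapsed, score))
```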
