Is it possible to specify your own distance function using scikit-learn K-Means clustering?

Here is a small kmeans that uses any of the 20-odd distances in scipy.spatial.distance, or a user function.

Comments would be welcome (this has had only one user so far, which is not enough); in particular, what are your N, dim, k, metric?

```python
#!/usr/bin/env python
# kmeans.py using any of the 20-odd metrics in scipy.spatial.distance
# kmeanssample: 2 passes, first over a random sample of ~ sqrt(N) points

import random

import numpy as np
from scipy.spatial.distance import cdist  # $scipy/spatial/distance.py
    # http://docs.scipy.org/doc/scipy/reference/spatial.html
from scipy.sparse import issparse  # $scipy/sparse/csr.py

__date__ = "2011-11-17 Nov denis"
    # X sparse, any cdist metric: real app ?
    # centres get dense rapidly, metrics in high dim hit distance whiteout
    # vs unsupervised / semi-supervised svm

#...............................................................................
def kmeans( X, centres, delta=.001, maxiter=10, metric="euclidean", p=2, verbose=1 ):
    """ centres, Xtocentre, distances = kmeans( X, initial centres ... )
    in:
        X N x dim  may be sparse
        centres k x dim: initial centres, e.g. random.sample( X, k )
        delta: relative error, iterate until the average distance to centres
            is within delta of the previous average distance
        maxiter
        metric: any of the 20-odd in scipy.spatial.distance
            "chebyshev" = max, "cityblock" = L1, "minkowski" with p=
            or a function( Xvec, centrevec ), e.g. Lqmetric below
        p: for minkowski metric -- local mod cdist for 0 < p < 1 too
        verbose: 0 silent, 2 prints running distances
    out:
        centres, k x dim
        Xtocentre: each X -> its nearest centre, ints N -> k
        distances, N
    see also: kmeanssample below, class Kmeans below.
    """
    if not issparse(X):
        X = np.asanyarray(X)  # ?
    centres = centres.todense() if issparse(centres) \
        else centres.copy()
    N, dim = X.shape
    k, cdim = centres.shape
    if dim != cdim:
        raise ValueError( "kmeans: X %s and centres %s must have the same number of columns" % (
            X.shape, centres.shape ))
    if verbose:
        print( "kmeans: X %s  centres %s  delta=%.2g  maxiter=%d  metric=%s" % (
            X.shape, centres.shape, delta, maxiter, metric ))
    pkw = {"p": p} if metric == "minkowski" else {}  # newer scipy rejects unused kwargs
    allx = np.arange(N)
    prevdist = 0
    for jiter in range( 1, maxiter+1 ):
        D = cdist_sparse( X, centres, metric=metric, **pkw )  # |X| x |centres|
        xtoc = D.argmin(axis=1)  # X -> index of nearest centre
        distances = D[allx,xtoc]
        avdist = distances.mean()  # median ?
        if verbose >= 2:
            print( "kmeans: av |X - nearest centre| = %.4g" % avdist )
        if (1 - delta) * prevdist <= avdist <= prevdist \
        or jiter == maxiter:
            break
        prevdist = avdist
        for jc in range(k):  # move each centre to the mean of its points (1 pass in C)
            c = np.where( xtoc == jc )[0]
            if len(c) > 0:
                centres[jc] = X[c].mean( axis=0 )
    if verbose:
        print( "kmeans: %d iterations  cluster sizes:" % jiter, np.bincount(xtoc) )
    if verbose >= 2:
        r50 = np.zeros(k)
        r90 = np.zeros(k)
        for j in range(k):
            dist = distances[ xtoc == j ]
            if len(dist) > 0:
                r50[j], r90[j] = np.percentile( dist, (50, 90) )
        print( "kmeans: cluster 50 % radius", r50.astype(int) )
        print( "kmeans: cluster 90 % radius", r90.astype(int) )
            # scale L1 / dim, L2 / sqrt(dim) ?
    return centres, xtoc, distances

#...............................................................................
def kmeanssample( X, k, nsample=0, **kwargs ):
    """ 2-pass kmeans, fast for large N:
        1) kmeans a random sample of nsample ~ sqrt(N) from X
        2) full kmeans, starting from those centres
    """
        # merge w kmeans ? mttiw
        # v large N: sample N^1/2, N^1/2 of that
        # seed like sklearn ?
    N, dim = X.shape
    if nsample == 0:
        nsample = max( 2*np.sqrt(N), 10*k )
    Xsample = randomsample( X, int(nsample) )
    pass1centres = randomsample( X, int(k) )
    samplecentres = kmeans( Xsample, pass1centres, **kwargs )[0]
    return kmeans( X, samplecentres, **kwargs )

def cdist_sparse( X, Y, **kwargs ):
    """ -> |X| x |Y| cdist array, any cdist metric
        X or Y may be sparse -- best csr
    """
        # todense row at a time, v slow if both v sparse
    sxy = 2*issparse(X) + issparse(Y)
    if sxy == 0:
        return cdist( X, Y, **kwargs )
    d = np.empty( (X.shape[0], Y.shape[0]), np.float64 )
    if sxy == 2:
        for j, x in enumerate(X):
            d[j] = cdist( x.todense(), Y, **kwargs ) [0]
    elif sxy == 1:
        for k, y in enumerate(Y):
            d[:,k] = cdist( X, y.todense(), **kwargs ) [0]
    else:
        for j, x in enumerate(X):
            for k, y in enumerate(Y):
                d[j,k] = cdist( x.todense(), y.todense(), **kwargs ) [0]
    return d

def randomsample( X, n ):
    """ random.sample of the rows of X
        X may be sparse -- best csr
    """
    sampleix = random.sample( range( X.shape[0] ), int(n) )
    return X[sampleix]

def nearestcentres( X, centres, metric="euclidean", p=2 ):
    """ each X -> nearest centre, any metric
        euclidean2 (~ withinss) is more sensitive to outliers,
        cityblock (manhattan, L1) less sensitive
    """
    pkw = {"p": p} if metric == "minkowski" else {}
    D = cdist( X, centres, metric=metric, **pkw )  # |X| x |centres|
    return D.argmin(axis=1)

def Lqmetric( x, y=None, q=.5 ):
    # yes a metric, may increase weight of near matches; see ...
    return (np.abs(x - y) ** q) .mean() if y is not None \
        else (np.abs(x) ** q) .mean()

#...............................................................................
class Kmeans:
    """ km = Kmeans( X, k= or centres=, ... )
        in: either initial centres= for kmeans
            or k= [nsample=] for kmeanssample
        out: km.centres, km.Xtocentre, km.distances
        iterator:
            for jcentre, J in km:
                clustercentre = centres[jcentre]
                J indexes e.g. X[J], classes[J]
    """
    def __init__( self, X, k=0, centres=None, nsample=0, **kwargs ):
        self.X = X
        if centres is None:
            self.centres, self.Xtocentre, self.distances = kmeanssample(
                X, k=k, nsample=nsample, **kwargs )
        else:
            self.centres, self.Xtocentre, self.distances = kmeans(
                X, centres, **kwargs )

    def __iter__(self):
        for jc in range(len(self.centres)):
            yield jc, (self.Xtocentre == jc)

#...............................................................................
if __name__ == "__main__":
    import sys
    from time import time

    N = 10000
    dim = 10
    ncluster = 10
    kmsample = 100  # 0: random centres, > 0: kmeanssample
    kmdelta = .001
    kmiter = 10
    metric = "cityblock"  # "chebyshev" = max, "cityblock" L1,  Lqmetric
    seed = 1

    exec( "\n".join( sys.argv[1:] ))  # run this.py N= ...
    np.set_printoptions( 1, threshold=200, edgeitems=5, suppress=True )
    np.random.seed(seed)
    random.seed(seed)

    print( "N %d  dim %d  ncluster %d  kmsample %d  metric %s" % (
        N, dim, ncluster, kmsample, metric ))
    X = np.random.exponential( size=(N,dim) )
        # cf scikits-learn datasets/
    t0 = time()
    if kmsample > 0:
        centres, xtoc, dist = kmeanssample( X, ncluster, nsample=kmsample,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    else:
        randomcentres = randomsample( X, ncluster )
        centres, xtoc, dist = kmeans( X, randomcentres,
            delta=kmdelta, maxiter=kmiter, metric=metric, verbose=2 )
    print( "%.0f msec" % ((time() - t0) * 1000) )

    # also ~/py/np/kmeans/test-kmeans.py
```
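A minimal usage sketch, assuming the module above is saved as kmeans.py; the data and parameters here are hypothetical:

```python
import numpy as np
from kmeans import kmeanssample, Kmeans, Lqmetric  # the module above

np.random.seed(1)
X = np.random.exponential(size=(1000, 4))  # hypothetical test data

# any scipy.spatial.distance metric by name ...
centres, xtoc, dist = kmeanssample(X, 5, nsample=100,
                                   metric="cityblock", verbose=0)

# ... or any callable f(x, y) -> distance, e.g. Lqmetric above
km = Kmeans(X, k=5, nsample=100, metric=Lqmetric, verbose=0)
for jcentre, J in km:       # J: boolean mask of the points in cluster jcentre
    print(jcentre, J.sum())
```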

1) For cosine distance, first normalize all the data vectors to |X| = 1; then

cosinedistance( X, Y ) = 1 - X . Y = Euclidean distance |X - Y|^2 / 2

which is fast. For bit vectors, keep the norms separately from the vectors instead of expanding out to floats (although some programs may expand for you). For sparse vectors, say 1 % of N, X . Y should take time O( 2 % of N ) and space O(N); but I don't know which programs do that. A minimal sketch of this normalization trick follows after this list.

2) Scikit-learn clustering gives an excellent overview of k-means, mini-batch-k-means ... with code that works on scipy.sparse matrices.

3) Always check the cluster sizes after k-means. If you're expecting roughly equal-sized clusters, but they come out [44 37 9 5 5] % ... (sound of head-scratching).
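A minimal sketch of the normalization trick from note 1, on hypothetical data; the assert checks the identity 1 - X . Y = |X - Y|^2 / 2 for unit vectors:

```python
import numpy as np

np.random.seed(0)
X = np.random.exponential(size=(1000, 10))            # hypothetical data
Xnorm = X / np.linalg.norm(X, axis=1, keepdims=True)  # each row now has |x| = 1

# on unit vectors, cosine distance = squared Euclidean distance / 2
x, y = Xnorm[0], Xnorm[1]
assert np.isclose(1 - x.dot(y), ((x - y) ** 2).sum() / 2)

# so plain Euclidean k-means on Xnorm, e.g. kmeans( Xnorm, centres,
# metric="euclidean" ) with the code above, clusters by cosine distance
```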

K-means is one of the most widely used clustering algorithms. Its core idea is to partition the data points into K clusters so that points within the same cluster are similar to each other, while points in different clusters are not. scikit-learn provides a ready-made KMeans implementation that makes clustering straightforward. This section walks through clustering with scikit-learn's KMeans and then introduces MiniBatchKMeans.

## 1. Data preparation

First, generate a random data set to demonstrate KMeans clustering:

```python
import numpy as np

# generate random data
np.random.seed(0)
X = np.random.randn(1000, 2)  # 1000 two-dimensional points
```

## 2. Model training

Next, cluster the data with a KMeans model:

```python
from sklearn.cluster import KMeans

# build the model
kmeans = KMeans(n_clusters=3, random_state=0)

# train the model
kmeans.fit(X)
```

Here the data is split into 3 clusters; adjust n_clusters to suit your data. After training, inspect the positions of the cluster centres:

```python
print(kmeans.cluster_centers_)
```

Output:

```
[[ 0.05161133 -0.96525049]
 [ 1.06359705 -0.02646225]
 [-0.9680658   0.04252211]]
```

## 3. Prediction and evaluation

Once trained, the model can assign new data to clusters:

```python
# predict labels for data points
y_pred = kmeans.predict(X)
```

The silhouette coefficient is a common measure of clustering quality. It ranges over [-1, 1], and the closer it is to 1, the better the clustering. In scikit-learn it is computed with metrics.silhouette_score:

```python
from sklearn import metrics

# compute the silhouette coefficient
score = metrics.silhouette_score(X, y_pred)
print(score)
```

Output:

```
0.6011942331016043
```

## 4. MiniBatchKMeans

One drawback of the KMeans algorithm is that clustering large data sets is slow. scikit-learn therefore also provides MiniBatchKMeans, which speeds clustering up. Its usage mirrors KMeans:

```python
from sklearn.cluster import MiniBatchKMeans

# build the model
mbkmeans = MiniBatchKMeans(n_clusters=3, random_state=0)

# train the model
mbkmeans.fit(X)

# predict labels for data points
y_pred = mbkmeans.predict(X)

# compute the silhouette coefficient
score = metrics.silhouette_score(X, y_pred)
print(score)
```

Note that MiniBatchKMeans may cluster slightly worse than KMeans, but it is much faster; when working with large data sets it is often the better first choice, as the rough timing sketch below illustrates.

This section covered the steps for clustering with scikit-learn's KMeans and introduced MiniBatchKMeans. In practice, choose the clustering algorithm and parameters that fit your situation.
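A rough timing sketch of that trade-off on synthetic data (illustrative only; timings and scores depend on the machine and scikit-learn version, and the silhouette is computed on a subsample because the full computation is O(N^2)):

```python
import time

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans

np.random.seed(0)
X = np.random.randn(100_000, 2)  # a larger synthetic data set

for Model in (KMeans, MiniBatchKMeans):
    t0 = time.time()
    model = Model(n_clusters=3, random_state=0, n_init=10).fit(X)
    elapsed = time.time() - t0
    # silhouette on a 10k subsample -- full pairwise distances would be huge
    score = metrics.silhouette_score(X, model.labels_,
                                     sample_size=10_000, random_state=0)
    print("%-16s %6.2f s   silhouette %.3f" % (Model.__name__, elapsed, score))
```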
