第一个:
import numpy as np
import matplotlib. pyplot as plt
from sklearn import manifold, datasets
# Load the digits dataset; n_class=6 restricts it to six digit classes.
digits = datasets.load_digits(n_class=6)
X = digits.data
y = digits.target
# Rows of X are samples, columns are features.
n_samples, n_features = X.shape
# Display the raw data: tile the first n * n digit images into one mosaic.
n = 20
img = np.zeros((10 * n, 10 * n))
for i in range(n):
    ix = 10 * i + 1
    for j in range(n):
        iy = 10 * j + 1
        # Each 64-element row of X is reshaped to an 8x8 image and written
        # into its own 10x10 cell, offset by one pixel so that neighbouring
        # digits are separated by blank margins.
        img[ix:ix + 8, iy:iy + 8] = X[i * n + j].reshape((8, 8))
plt.figure(figsize=(8, 8))
plt.imshow(img, cmap=plt.cm.binary)
# Hide the axis ticks; pixel coordinates carry no meaning for the mosaic.
plt.xticks([])
plt.yticks([])
plt.show()
type ( X)
numpy.ndarray
X[ 0 ]
array([ 0., 0., 5., 13., 9., 1., 0., 0., 0., 0., 13., 15., 10.,
15., 5., 0., 0., 3., 15., 2., 0., 11., 8., 0., 0., 4.,
12., 0., 0., 8., 8., 0., 0., 5., 8., 0., 0., 9., 8.,
0., 0., 4., 11., 0., 1., 12., 7., 0., 0., 2., 14., 5.,
10., 12., 0., 0., 0., 0., 6., 13., 10., 0., 0., 0.])
X. size
69312
X. shape
(1083, 64)
print ( n_samples, n_features)
1083 64
'''t-SNE'''
# Reduce the digit vectors to a two-component t-SNE embedding. The layout is
# initialised from PCA and the random seed is pinned.
tsne = manifold.TSNE(
    n_components=2,
    init='pca',
    random_state=501,
)
X_tsne = tsne.fit_transform(X)
# Report the feature count before and after the embedding.
print(
    "Org data dimension is {}.Embedded data dimension is {}".format(
        X.shape[-1], X_tsne.shape[-1]
    )
)
# Visualise the embedding space: min-max rescale each embedding axis to
# [0, 1], then draw every sample as its digit label, coloured by class.
x_min, x_max = X_tsne.min(0), X_tsne.max(0)
X_norm = (X_tsne - x_min) / (x_max - x_min)
plt.figure(figsize=(8, 8))
for (px, py), label in zip(X_norm, y):
    # The label doubles as the text drawn and as the index into the Set1
    # colour map, so each class gets its own colour.
    plt.text(px, py, str(label), color=plt.cm.Set1(label),
             fontdict={'weight': 'bold', 'size': 20})
plt.xticks([])
plt.yticks([])
plt.show()
Org data dimension is 64.Embedded data dimension is 2
第二个:
import pandas as pd
# Input workbook and clustering settings.
inputfile = r'consumption_data.xls'
k = 3              # passed to KMeans as n_clusters further down
iteration = 500    # passed to KMeans as max_iter further down
data = pd.read_excel(inputfile, index_col='Id')
# Z-score standardisation: subtract each column's mean and divide by its
# standard deviation. Multiplying by 1.0 first promotes integer columns to
# float before the division.
data_zs = data.sub(data.mean()).mul(1.0).div(data.std())
type ( data_zs)
pandas.core.frame.DataFrame
print ( data_zs. index)
Int64Index([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
...
933, 934, 935, 936, 937, 938, 939, 940, 941, 942],
dtype='int64', name='Id', length=940)
print ( data_zs. columns)
Index(['R', 'F', 'M'], dtype='object')
print ( data_zs[ 0 : 3 ] )
R F M
Id
1 0.764186 -0.493579 -1.158711
2 -1.024757 -0.630079 0.622527
3 -0.950217 0.871423 -0.341103
from sklearn. cluster import KMeans
# Fit k-means on the standardised data. n_jobs is deliberately not passed:
# scikit-learn deprecated that argument in 0.23 and removed it in 1.0, where
# supplying it raises a TypeError.
model = KMeans(n_clusters=k, max_iter=iteration)
model.fit(data_zs)
# Cluster sizes: number of samples per label, largest cluster first.
r1 = pd.Series(model.labels_).value_counts()
r1
0 559
1 341
2 40
dtype: int64
# Cluster centres as a table: one row per cluster, one column per feature.
r2 = pd.DataFrame(data=model.cluster_centers_)
r2
          0         1         2
0 -0.149353 -0.658893 -0.271780
1 -0.160451  1.114802  0.392844
2  3.455055 -0.295654  0.449123
# Put centres and cluster sizes side by side; concat along the columns
# aligns the rows of both inputs on the cluster label index.
r = pd.concat([r2, r1], axis='columns')
r
          0         1         2    0
0 -0.149353 -0.658893 -0.271780  559
1 -0.160451  1.114802  0.392844  341
2  3.455055 -0.295654  0.449123   40
# Name the columns: the original feature names followed by the size column.
r.columns = [*data.columns, u'类别数目']
print(r)
R F M 类别数目
0 -0.149353 -0.658893 -0.271780 559
1 -0.160451 1.114802 0.392844 341
2 3.455055 -0.295654 0.449123 40
type ( model)
sklearn.cluster.k_means_.KMeans
model[ 0 ]
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-30-97b48cc27306> in <module>()
----> 1 model[0]
TypeError: 'KMeans' object does not support indexing
type ( model. labels_)
numpy.ndarray
model. labels_. size
940
model. cluster_centers_
array([[-0.14935336, -0.65889299, -0.27177977],
[-0.16045063, 1.11480154, 0.39284443],
[ 3.45505486, -0.29565357, 0.44912342]])
# Attach each sample's cluster label to the raw (unstandardised) data, write
# the table to Excel, then read it back and show the first three rows.
outputfile = 'data_type.xls'
cluster_labels = pd.Series(model.labels_, index=data.index)
r = pd.concat([data, cluster_labels], axis=1)
r.columns = [*data.columns, u'聚类类别']
r.to_excel(outputfile)
data_out = pd.read_excel(outputfile, index_col='Id')
print(data_out[0:3])
R F M 聚类类别
Id
1 27 6 232.61 0
2 3 5 1507.11 0
3 4 16 817.62 1
from sklearn. manifold import TSNE
# Embed the standardised data with a default-configured t-SNE and keep the
# coordinates in a DataFrame that shares the sample index.
tsne = pd.DataFrame(TSNE().fit_transform(data_zs), index=data_zs.index)

import matplotlib.pyplot as plt
# Font and minus-sign settings (presumably so Chinese text renders correctly).
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

# One scatter layer per cluster label, each with its own marker style.
for label, marker in ((0, 'r.'), (1, 'go'), (2, 'b*')):
    d = tsne[r[u'聚类类别'] == label]
    plt.plot(d[0], d[1], marker)
plt.show()
print(tsne[0:3])
0 1
Id
1 13.233582 22.531153
2 15.523943 -25.674824
3 -30.262331 11.098033
print ( tsne. index. size)
940
r[ 0 : 3 ]
     R   F        M  聚类类别
Id
1   27   6   232.61     0
2    3   5  1507.11     0
3    4  16   817.62     1
d = tsne[ r[ u'聚类类别' ] == 0 ]
d[ 0 : 3 ]
            0          1
Id
1   13.233582  22.531153
2   15.523943 -25.674824
4  -24.099859  24.144758
d. index. size
559
plt. plot( d[ 0 ] , d[ 1 ] , 'r.' )
[<matplotlib.lines.Line2D at 0x2338ee6a7f0>]
d1 = tsne[ r[ u'聚类类别' ] == 1 ]
plt. plot( d1[ 0 ] , d1[ 1 ] , 'go' )
[<matplotlib.lines.Line2D at 0x2338efd7ef0>]
# Redraw the full scatter: select the embedded points of each cluster label
# in turn and plot them with that cluster's marker style.
marker_by_label = {0: 'r.', 1: 'go', 2: 'b*'}
for label, marker in marker_by_label.items():
    d = tsne[r[u'聚类类别'] == label]
    plt.plot(d[0], d[1], marker)
plt.show()
第二个例子来源于张良均等著《Python数据分析与挖掘实战》。