import pandas as pd
import numpy as np
# make_blobs is taken from the public ``sklearn.datasets`` namespace: the
# private ``sklearn.datasets.samples_generator`` module was removed in
# scikit-learn 0.24, while this import path works on old and new releases.
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import cluster

# Draw 2000 points around two centres, both with a standard deviation of 0.5.
X, y = make_blobs(
    n_samples=2000,
    centers=[[-1, -2], [1, 3]],
    cluster_std=[0.5, 0.5],
    random_state=1234,
)

# Put the coordinates and the generating label side by side for plotting.
plot_data = pd.DataFrame(np.column_stack((X, y)), columns=['x1', 'x2', 'y'])

plt.style.use('ggplot')
# x and y are passed by keyword (as the later lmplot calls in this script
# already do); seaborn >= 0.12 no longer accepts them positionally.
sns.lmplot(x='x1', y='x2', data=plot_data, hue='y', markers=['^', 'o'],
           fit_reg=False, legend=False)
plt.show()
# Cluster the two-blob sample with K-means and with DBSCAN, then draw the two
# labelings side by side.
from sklearn import cluster

# fit() returns the estimator itself, so construction and fitting are chained.
kmeans = cluster.KMeans(n_clusters=2, random_state=1234).fit(X)
dbscan = cluster.DBSCAN(eps=0.5, min_samples=10).fit(X)

plot_data['kmeans_label'] = kmeans.labels_
plot_data['dbscan_label'] = dbscan.labels_

# Recode the DBSCAN labels (-1 is DBSCAN's noise label) into the colour codes
# used for the right-hand panel. Any label missing from the mapping becomes NaN.
dbscan_colors = plot_data['dbscan_label'].map({-1: 1, 0: 2, 1: 0})

plt.figure(figsize=(12, 6))

# Left panel: K-means assignment.
ax1 = plt.subplot2grid(shape=(1, 2), loc=(0, 0))
ax1.scatter(plot_data['x1'], plot_data['x2'], c=plot_data['kmeans_label'])

# Right panel: DBSCAN assignment.
ax2 = plt.subplot2grid(shape=(1, 2), loc=(0, 1))
ax2.scatter(plot_data['x1'], plot_data['x2'], c=dbscan_colors)

plt.show()
# Non-spherical example: two interleaving half-moons plus one round blob,
# clustered with K-means and DBSCAN for comparison.
#
# make_moons comes from the public ``sklearn.datasets`` namespace; the
# ``samples_generator`` submodule was removed in scikit-learn 0.24.
from sklearn.datasets import make_moons

X1, y1 = make_moons(n_samples=2000, noise=0.05, random_state=1234)
X2, y2 = make_blobs(n_samples=1000, centers=[[3, 3]], cluster_std=0.5,
                    random_state=1234)
# Relabel the blob's class 0 as 2 (anything else would become 0) so the blob
# gets its own hue level in the plot below.
y2 = np.where(y2 == 0, 2, 0)

# Stack both samples into one frame. np.vstack replaces np.row_stack, an alias
# that is deprecated since NumPy 2.0.
plot_data = pd.DataFrame(
    np.vstack([np.column_stack((X1, y1)), np.column_stack((X2, y2))]),
    columns=['x1', 'x2', 'y'],
)

# x and y are passed by keyword; seaborn >= 0.12 rejects them positionally.
sns.lmplot(x='x1', y='x2', data=plot_data, hue='y', markers=['^', 'o', '>'],
           fit_reg=False, legend=False)
plt.show()

kmeans = cluster.KMeans(n_clusters=3, random_state=1234)
kmeans.fit(plot_data[['x1', 'x2']])
dbscan = cluster.DBSCAN(eps=0.3, min_samples=5)
dbscan.fit(plot_data[['x1', 'x2']])
plot_data['kmeans_label'] = kmeans.labels_
plot_data['dbscan_label'] = dbscan.labels_

plt.figure(figsize=(12, 6))
# Left panel: K-means assignment.
ax1 = plt.subplot2grid(shape=(1, 2), loc=(0, 0))
ax1.scatter(plot_data.x1, plot_data.x2, c=plot_data.kmeans_label)
# Right panel: DBSCAN assignment, recoded to colour codes (-1 is DBSCAN's
# noise label). Labels missing from the mapping become NaN.
ax2 = plt.subplot2grid(shape=(1, 2), loc=(0, 1))
ax2.scatter(plot_data.x1, plot_data.x2,
            c=plot_data.dbscan_label.map({-1: 1, 0: 0, 1: 3, 2: 2}))
plt.show()
# Two blobs with different spreads (0.2 vs 0.45), used to compare three
# agglomerative-clustering linkage criteria.
X, y = make_blobs(
    n_samples=2000,
    centers=[[-1, 0], [1, 0.5]],
    cluster_std=[0.2, 0.45],
    random_state=1234,
)
plot_data = pd.DataFrame(np.column_stack((X, y)), columns=['x1', 'x2', 'y'])
# x and y are passed by keyword; seaborn >= 0.12 rejects them positionally.
sns.lmplot(x='x1', y='x2', data=plot_data, hue='y', markers=['^', 'o'],
           fit_reg=False, legend=False)
plt.show()

plt.figure(figsize=(16, 5))

# Panel 1: Ward linkage (the variable is called agnes_min, but the linkage
# actually requested is 'ward').
ax1 = plt.subplot2grid(shape=(1, 3), loc=(0, 0))
agnes_min = cluster.AgglomerativeClustering(n_clusters=2, linkage='ward')
agnes_min.fit(X)
ax1.scatter(X[:, 0], X[:, 1], c=agnes_min.labels_)

# Panel 2: complete linkage.
ax2 = plt.subplot2grid(shape=(1, 3), loc=(0, 1))
agnes_max = cluster.AgglomerativeClustering(n_clusters=2, linkage='complete')
agnes_max.fit(X)
ax2.scatter(X[:, 0], X[:, 1], c=agnes_max.labels_)

# Panel 3: average linkage. The axes gets its own name and is drawn on
# explicitly, like the other two panels, instead of rebinding ax2 and relying
# on pyplot's implicit "current axes".
ax3 = plt.subplot2grid(shape=(1, 3), loc=(0, 2))
agnes_avg = cluster.AgglomerativeClustering(n_clusters=2, linkage='average')
agnes_avg.fit(X)
ax3.scatter(X[:, 0], X[:, 1], c=agnes_avg.labels_)

plt.show()
# Load the per-province table and look at birth rate against death rate.
# NOTE(review): the path is an absolute location on one Windows desktop;
# adjust it to wherever Province.xlsx lives on the machine running this script.
Province = pd.read_excel(r'C:\Users\Administrator\Desktop\Province.xlsx')
# A bare ``Province.head()`` only displays something in an interactive session;
# print it so the preview is also visible when the file runs as a script.
print(Province.head())

plt.scatter(Province.Birth_Rate, Province.Death_Rate, c='steelblue')
plt.xlabel('Birth_Rate')
plt.ylabel('Death_Rate')
plt.show()
# Grid-search DBSCAN's eps / min_samples on the standardised rates and record,
# for every combination, how many clusters and noise points it produces.
from sklearn import preprocessing

predictors = ['Birth_Rate', 'Death_Rate']
# Standardise both columns so a single eps applies to each of them.
X = preprocessing.scale(Province[predictors])
X = pd.DataFrame(X)

res = []
for eps in np.arange(0.001, 1, 0.05):
    for min_samples in range(2, 10):
        dbscan = cluster.DBSCAN(eps=eps, min_samples=min_samples)
        dbscan.fit(X)
        labels = dbscan.labels_
        # DBSCAN marks noise points with the label -1.
        clustered = labels[labels != -1]
        n_clusters = len(set(clustered))
        outliners = int(np.sum(labels == -1))
        # Cluster sizes (largest first), stored as text so they fit in one cell.
        stats = str(pd.Series(clustered).value_counts().values)
        res.append({'eps': eps, 'min_samples': min_samples,
                    'n_clusters': n_clusters, 'outliners': outliners,
                    'stats': stats})

df = pd.DataFrame(res)
# Show the parameter combinations that produce exactly three clusters. As a
# bare expression this result was discarded when the file ran as a script.
print(df.loc[df.n_clusters == 3, :])
# Fit DBSCAN with the chosen parameters and plot the labelled provinces.
# fit() returns the estimator itself, so construction and fitting are chained.
dbscan = cluster.DBSCAN(eps=0.801, min_samples=3).fit(X)
Province['dbscan_label'] = dbscan.labels_

birth = Province['Birth_Rate']
death = Province['Death_Rate']

# One marker per distinct value of dbscan_label.
sns.lmplot(x='Birth_Rate', y='Death_Rate', hue='dbscan_label', data=Province,
           markers=['*', 'd', '^', 'o'], fit_reg=False, legend=False)

# Annotate every point with its province name, offset slightly from the marker.
for x, y, text in zip(birth, death, Province['Province']):
    plt.text(x + 0.1, y - 0.1, text, size=8)

# Dashed red reference lines at Death_Rate = 5.8 and Birth_Rate = 10, each
# spanning the observed range of the other variable.
plt.hlines(y=5.8, xmin=birth.min(), xmax=birth.max(),
           linestyles='--', colors='red')
plt.vlines(x=10, ymin=death.min(), ymax=death.max(),
           linestyles='--', colors='red')

plt.xlabel('Birth_Rate')
plt.ylabel('Death_Rate')
plt.show()
# Hierarchical (Ward-linkage) clustering of the standardised rates into three
# groups, plotted on the original birth/death-rate axes.
agnes_min = cluster.AgglomerativeClustering(n_clusters=3, linkage='ward').fit(X)
Province['agnes_label'] = agnes_min.labels_

sns.lmplot(
    x='Birth_Rate',
    y='Death_Rate',
    hue='agnes_label',
    data=Province,
    markers=['d', '^', 'o'],
    fit_reg=False,
    legend=False,
)
plt.xlabel('Birth_Rate')
plt.ylabel('Death_Rate')
plt.show()
# The module is ``sklearn.metrics``; the previous ``metricstrics`` spelling
# raised ImportError and left ``metrics`` undefined for k_silhouette.
from sklearn import metrics


def k_silhouette(X, clusters, random_state=None):
    """Plot the silhouette score of K-means for k = 2 .. ``clusters``.

    For every k in the range a K-means model is fitted on ``X`` and the
    Euclidean silhouette score of its labeling is computed; the scores are
    then drawn against k and the figure is shown.

    Args:
        X: Samples to cluster, in any form accepted by ``KMeans.fit``.
        clusters: Largest number of clusters to try (inclusive).
        random_state: Optional seed forwarded to ``KMeans``. The default
            ``None`` keeps the previous unseeded behaviour, where the curve
            can vary from run to run.

    Returns:
        None. The result is the displayed figure.
    """
    # A silhouette score needs at least two clusters, hence the start at 2.
    K = range(2, clusters + 1)
    S = []
    for k in K:
        kmeans = cluster.KMeans(n_clusters=k, random_state=random_state)
        kmeans.fit(X)
        S.append(metrics.silhouette_score(X, kmeans.labels_, metric='euclidean'))

    # The axis labels are Chinese, so select a font that has CJK glyphs and
    # turn off the Unicode minus sign.
    plt.rcParams['font.sans-serif'] = ['Microsoft YaHei']
    plt.rcParams['axes.unicode_minus'] = False
    plt.style.use('ggplot')
    plt.plot(K, S, 'b*-')
    plt.xlabel('簇的个数')  # "number of clusters"
    plt.ylabel('轮廓系数')  # "silhouette coefficient"
    plt.show()
# Inspect silhouette scores for k = 2..10, then cluster the standardised rates
# into three groups with K-means and plot them on the original axes.
k_silhouette(X, clusters=10)

# fit() returns the estimator itself, so construction and fitting are chained.
kmeans = cluster.KMeans(n_clusters=3).fit(X)
Province['kmeans_label'] = kmeans.labels_

sns.lmplot(
    x='Birth_Rate',
    y='Death_Rate',
    hue='kmeans_label',
    data=Province,
    markers=['d', '^', 'o'],
    fit_reg=False,
    legend=False,
)
plt.xlabel('Birth_Rate')
plt.ylabel('Death_Rate')
plt.show()