sklearn中KNN,NB,K-means,DBSCAN算法的实现(用法不全,如有错误,请指正)

sklearn中的KNN分类器

  • KNeighborsClassifier()
    • n_neighbors = 5:邻居的个数,最好取奇数(避免投票平局)
    • algorithm = 'auto': 自动根据数据类型选择合适的算法,除此以外还可以选用其他算法,如‘ball_tree’:球树,‘kd_tree’:kd树
    • from sklearn.neighbors import KNeighborsClassifier
      from sklearn.datasets import make_blobs
      import numpy as np
      import pandas as pd
      #随机生成数据集,可以自己设置参数
      data = make_blobs(n_samples = 500, centers = 5, 
                        random_state = 8)
      # data
      
      #数据可视化
      import matplotlib.pyplot as plt
      %matplotlib inline
      plt.scatter(X[:,0],X[:,1], c = y, cmap = plt.cm.spring,
                  edgecolors = 'k')

可视化结果如下:

 对模型进行训练,并用matplotlib对结果进行可视化

# Train the classifier (n_neighbors defaults to 5)
clf = KNeighborsClassifier()
clf.fit(X, y)

# Visualize the decision regions: build a dense grid covering the data
# range (padded by 1 unit on each side), classify every grid point, and
# draw the result as a colored mesh with the samples overlaid.
x_min, x_max = X[:,0].min() - 1, X[:,0].max() + 1
y_min, y_max = X[:,1].min() - 1, X[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = plt.cm.Pastel1)
plt.scatter(X[:, 0], X[:, 1], c = y, cmap = plt.cm.spring,
            edgecolors='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier:KNN')  # typo fixed: 'Classfier' -> 'Classifier'

 可视化结果如下:

朴素贝叶斯算法的sklearn实现

  • naive_bayes.BernoulliNB:伯努利分布(0-1分布)
  • naive_bayes.GaussianNB:高斯分布
  • naive_bayes.MultinomialNB:多项分布
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

X1, y1 = make_blobs(n_samples = 500, centers = 5, random_state = 8)

#划分上面自动生成的数据为测试集与训练集
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X1,y1,random_state = 8)

#可视化数据
%matplotlib inline          #魔法命令,将图形在jupyter显示出来
plt.scatter(X1[:,0], X1[:,1], c = y1, cmap = plt.cm.spring,edgecolors='k')

数据可视化如下: 

 

二项分布及结果可视化

# Fit a Bernoulli naive Bayes model and report its test accuracy.
nb = BernoulliNB()
nb.fit(X_train, y_train)
print('模型得分:{:.3f}'.format(nb.score(X_test, y_test)))

# Visualize the decision regions on a dense grid (0.5-unit margin).
plt.figure(dpi=300)
x_min, x_max = X1[:,0].min() - 0.5, X1[:,0].max() + 0.5
y_min, y_max = X1[:,1].min() - 0.5, X1[:,1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = nb.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = plt.cm.Pastel1)
# Training points as circles, test points as stars.
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train,
            cmap = plt.cm.cool, edgecolors='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_test,
            cmap = plt.cm.cool, marker='*')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier:BernoulliNB')  # typo fixed: 'Classfier' -> 'Classifier'
plt.show()

 

高斯分布及结果可视化

# Fit a Gaussian naive Bayes model and report its test accuracy.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
print('模型得分:{:.3f}'.format(gnb.score(X_test, y_test)))

# Visualize the decision regions on a dense grid (0.5-unit margin).
plt.figure(dpi=300)
x_min, x_max = X1[:,0].min() - 0.5, X1[:,0].max() + 0.5
y_min, y_max = X1[:,1].min() - 0.5, X1[:,1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = gnb.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = plt.cm.Pastel1)
# Training points as circles, test points as stars.
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train,
            cmap = plt.cm.cool, edgecolors='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_test,
            cmap = plt.cm.cool, marker='*')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier:GaussianNB')  # typo fixed: 'Classfier' -> 'Classifier'
plt.show()

 

多项分布及结果可视化 

# Fit a multinomial NB on the raw blob data. NOTE: MultinomialNB requires
# non-negative feature values, so this raises an error when the generated
# data contains negatives -- the article scales the data in the next snippet.
mnb = MultinomialNB()
mnb.fit(X_train, y_train)
print('模型得分:{:.3f}'.format(mnb.score(X_test, y_test)))

在运行上述代码时,可能会报错,原因可能是数据中存在负值,因此需要对数据进行一些处理,如下:

from sklearn.preprocessing import MinMaxScaler

# MultinomialNB requires non-negative features, so rescale everything into
# [0, 1] first. The scaler is fit on the training set only.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_scaled, y_train)
# Bug fix: the model was fit on scaled data, so it must also be scored on
# scaled data (the original scored on the raw X_test).
print('模型得分:{:.3f}'.format(mnb.score(X_test_scaled, y_test)))

# Visualize the fitted decision regions; the grid points (xx/yy come from
# the earlier visualization cell) are scaled the same way before predicting.
plt.figure(dpi=300)
z = mnb.predict(scaler.transform(np.c_[(xx.ravel(), yy.ravel())]))
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = plt.cm.Pastel1)
# Training points as circles, test points as stars.
plt.scatter(X_train[:, 0], X_train[:, 1], c = y_train,
            cmap = plt.cm.cool, edgecolors='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c = y_test,
            cmap = plt.cm.cool, marker='*')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('Classifier:MultinomialNB')  # typo fixed: 'Classfier' -> 'Classifier'
plt.show()

 

 威斯康星乳腺肿瘤分类预测

 

# Wisconsin breast-cancer dataset: load, inspect, split, fit GaussianNB.
from sklearn.datasets import load_breast_cancer

cancer = load_breast_cancer()
cancer.keys()

# Class names and the 30 feature names.
print('肿瘤的分类:', cancer['target_names'])
print('\n肿瘤的特征:\n', cancer.feature_names)
print('\n', len(cancer.feature_names))

# Dataset dimensions, then a 75/25 train/test split.
X, y = cancer.data, cancer.target
print(X.shape, y.shape)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.25, random_state = 8)

gnb = GaussianNB()
gnb.fit(X_train, y_train)
print(f'训练集得分:{gnb.score(X_train, y_train)}')
print(f'测试集得分:{gnb.score(X_test, y_test)}')

聚类算法的sklearn实现

一、KMeans算法实现

  • n_init = 10
  • max_iter = 300:最大迭代次数
  • tol = 0.0001:容忍度
  • copy_x = True:创建数据副本,不修改原数据
  • n_jobs = 1:使用的CPU核数,默认1;设为 -1 时调用现在空余的所有的CPU运行程序

    1.KMeans类的属性

  • cluster_centers_:查看聚类中心点的坐标
  • labels_:查看数据点的标签
  • inertia_:各样本与最近的类中心距离之和(簇内误差平方和)
from sklearn.cluster import KMeans

#随机生成150个类别数为3的数据集
X3,y3 = make_blobs(n_samples = 150,centers = 3,
                random_state = 8)

%matplotlib inline
plt.figure(dpi=300)
plt.scatter(X3[:,0],X3[:,1],c=y3,cmap=plt.cm.spring,
           edgecolors='k')

 

# Fit a 3-cluster KMeans model to the blob data.
kmeans = KMeans(n_clusters = 3)
kmeans.fit(X3)

# Visualize the learned partition of the plane.
from matplotlib.colors import ListedColormap

# Colours for the filled decision regions and for the points.
cmap_light = ListedColormap(['#87CEFA','#CAFF70','#D8BFD8'])
cmap_bold = ListedColormap(['#1C1C1C','#104E8B','#68228B'])

# Grid spanning both features, padded by 1 unit on every side.
x_min, x_max = X3[:,0].min() - 1, X3[:,0].max() + 1
y_min, y_max = X3[:,1].min() - 1, X3[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))

# Label every grid point with its nearest cluster and paint the regions.
z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = cmap_light)

# Overlay the samples themselves.
plt.scatter(X3[:,0], X3[:,1], c=y3, cmap = plt.cm.spring,
            edgecolors='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('K-means Cluster')
plt.show()

 

'肘方法'确定k值 

# 'Elbow method' to choose k: fit KMeans for k = 1..19 and record the
# within-cluster sum of squares (inertia_); the bend in the curve marks a
# reasonable k.
# Bug fix: this section analyses the blob data X3 -- the original called
# km.fit(X), but X had been reassigned to the breast-cancer data above.
distortion = []
for k in range(1, 20):
    km = KMeans(n_clusters=k, init = 'k-means++', random_state = 8)
    km.fit(X3)
    distortion.append(km.inertia_)
print(distortion)

# Visualize the cluster centers found by the fitted KMeans model.
# Region / point colour maps.
cmap_light = ListedColormap(['#87CEFA','#CAFF70','#D8BFD8'])
cmap_bold = ListedColormap(['#1C1C1C','#104E8B','#68228B'])

# Evaluation grid covering both features with a 1-unit margin.
x_min, x_max = X3[:,0].min() - 1, X3[:,0].max() + 1
y_min, y_max = X3[:,1].min() - 1, X3[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))

# Paint each region with its cluster's colour.
z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = cmap_light)

# Samples as small dots, centroids as large red stars.
plt.scatter(X3[:,0], X3[:,1], c=y3, cmap = plt.cm.spring,
            edgecolors='k', s=20)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
            s = 200, marker='*', c = 'red', label = 'centroids')
plt.legend()

 

计算轮廓系数

  • from sklearn.metrics import silhouette_samples
# Silhouette analysis: one horizontal bar per sample, grouped by cluster.
km = KMeans(n_clusters=3)
y_km = km.fit_predict(X3)

from matplotlib import cm
from sklearn.metrics import silhouette_samples

cluster_labels = np.unique(y_km)
n_clusters = cluster_labels.shape[0]
# Per-sample silhouette coefficients in [-1, 1].
silhouette_vals = silhouette_samples(X3, y_km, metric='euclidean')

y_ax_lower, y_ax_upper = 0, 0
yticks = []
for i, c in enumerate(cluster_labels):
    # Sorted silhouette values of cluster c, stacked above earlier bars.
    c_silhouette_vals = silhouette_vals[y_km == c]
    c_silhouette_vals.sort()
    y_ax_upper += len(c_silhouette_vals)
    color = cm.jet(i / n_clusters)
    plt.barh(range(y_ax_lower, y_ax_upper),
             c_silhouette_vals, height = 1.0,
             edgecolor = 'none', color = color)
    yticks.append((y_ax_lower + y_ax_upper) / 2)
    y_ax_lower += len(c_silhouette_vals)

silhouette_avg = np.mean(silhouette_vals)
# Dashed reference line at the mean silhouette score.
plt.axvline(silhouette_avg, color = 'red', linestyle = '--')
# Bug fix: the tick positions collected in `yticks` were never applied --
# label each cluster's band at its vertical midpoint.
plt.yticks(yticks, cluster_labels + 1)
plt.ylabel('Cluster')
plt.xlabel('Silhouette coefficient')

 

DBSCAN 

# Generate a two-moons dataset with a little noise and plot it.
from sklearn.datasets import make_moons

X, y = make_moons(n_samples = 200, noise = 0.05, random_state = 8)
plt.scatter(X[:,0], X[:,1], c = y, cmap = plt.cm.spring,
            edgecolors = 'k')

 

# The same moons shape without noise, for comparison.
X4, y4 = make_moons(n_samples = 200, noise = 0, random_state = 8)
plt.figure(dpi = 300)
plt.scatter(X4[:,0], X4[:,1], c = y4, cmap = plt.cm.spring,
            edgecolors = 'k')

 

# Cluster the noisy moons with K-Means (which cannot separate the two
# crescents -- this motivates DBSCAN below).
kmeans = KMeans(n_clusters = 2, init = 'random')
y_km = kmeans.fit_predict(X)

# Visualize the result.
# Region / point colour maps.
cmap_light = ListedColormap(['#87CEFA','#CAFF70','#D8BFD8'])
cmap_bold = ListedColormap(['#1C1C1C','#104E8B','#68228B'])
# Evaluation grid covering both features with a 1-unit margin.
x_min, x_max = X[:,0].min() - 1, X[:,0].max() + 1
y_min, y_max = X[:,1].min() - 1, X[:,1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
z = z.reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap = cmap_light)
# Bug fix: colour the points by this dataset's own labels y -- the
# original used y4, the labels of the *noise-free* moons dataset.
plt.scatter(X[:,0], X[:,1], c=y, cmap = plt.cm.spring,
            edgecolors='k', s=20)
plt.scatter(kmeans.cluster_centers_[:,0], kmeans.cluster_centers_[:,1],
            s = 200, marker='*', c = 'red', label = 'centroids')
plt.legend()
plt.grid()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title('K-Means Cluster')

 

# Density-based clustering: DBSCAN separates the two crescents that
# K-Means could not.
from sklearn.cluster import DBSCAN

db = DBSCAN(eps = 0.2, min_samples=5)
y_db = db.fit_predict(X)

# Plot each discovered cluster with its own colour and marker.
plt.scatter(X[y_db==0,0], X[y_db==0,1], c = 'red',
            marker='o', s=40, label='cluster 1',
            edgecolors='k')
plt.scatter(X[y_db==1,0], X[y_db==1,1], c = 'green',
            marker='s', s = 40, label = 'cluster 2',
            edgecolors = 'k')
# Bug fix: this figure shows the DBSCAN result, not K-means.
plt.title('DBSCAN clustering')
plt.legend()

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值