sklearn实践(KNN+NB+KMeans+DBSCAN)

本文介绍了如何在Python的sklearn库中使用KNN、朴素贝叶斯(包括伯努利、高斯和多项式分布)进行分类,以及K-Means和DBSCAN聚类算法的应用,以乳腺癌数据集和模拟数据为例,展示了模型的训练、可视化和预测过程。
摘要由CSDN通过智能技术生成

Sklearn中的KNN分类

#导入KNN分类器
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

from sklearn.datasets import make_blobs
# Toy data set: 200 samples drawn around 2 blob centres (fixed seed).
data = make_blobs(n_samples=200, centers=2, random_state=8)
X, y = data

# Build a KNN classifier with its default parameters and train it.
clf = KNeighborsClassifier()
clf.fit(X, y)

# Visualise the trained model: classify every node of a fine grid that
# covers the data (with a margin of 1 on each side) and colour the plane
# by the predicted class.
import matplotlib.pyplot as plt

x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1
grid_x = np.arange(x_min, x_max, 0.02)
grid_y = np.arange(y_min, y_max, 0.02)
xx, yy = np.meshgrid(grid_x, grid_y)
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = clf.predict(grid_points).reshape(xx.shape)

plt.pcolormesh(xx, yy, Z, cmap=plt.cm.Pastel1)
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolor='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier:KNN")

# Mark a previously unseen point with a red star and print its predicted class.
new_point = [6.75, 4.82]
plt.scatter(new_point[0], new_point[1], marker="*", c='red', s=200)
print('新数据点的分类是:', clf.predict([new_point]))

 

朴素贝叶斯算法(NB)

#导入朴素贝叶斯分类器
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

#导入数据集生成工具
from sklearn.datasets import make_blobs

#随机生成500个类别数是5的分类数据
X,y = make_blobs(n_samples = 500, centers = 5, random_state = 8 )

#数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')
#拆分训练集和测试集
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=8)

# Train a Bernoulli naive Bayes classifier and print its score on the test set.
nb = BernoulliNB()
nb.fit(X_train, y_train)
test_score = nb.score(X_test, y_test)
print('模型得分:{:.3f}'.format(test_score))

# Visualise the fit: predicted class over a fine grid spanning the data
# (margin of 0.5), with training samples drawn as dots and test samples
# as stars.
import matplotlib.pyplot as plt
plt.figure(dpi=100)

x_min, y_min = X.min(axis=0) - 0.5
x_max, y_max = X.max(axis=0) + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
z = nb.predict(grid_points).reshape(xx.shape)

plt.pcolormesh(xx, yy, z, cmap=plt.cm.Pastel1)
point_style = dict(cmap=plt.cm.cool, edgecolor='k')
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, **point_style)
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, marker='*', **point_style)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier:BernoulliNB")
plt.show()
# Train a Gaussian (normal distribution) naive Bayes classifier.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
# Report the score of the Gaussian model itself (`gnb`), not of the
# previously trained Bernoulli model `nb`.
print('模型得分:{:.3f}'.format(gnb.score(X_test, y_test)))

# Visualise the fit: predicted class over a fine grid spanning the data
# (margin of 0.5), with training samples drawn as dots and test samples
# as stars.
plt.figure(dpi=100)
import matplotlib.pyplot as plt
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
z = gnb.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap=plt.cm.Pastel1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.cool, edgecolor='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.cool, marker='*', edgecolor='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier:GaussianNB")
plt.show()
# Import the preprocessing tool.
from sklearn.preprocessing import MinMaxScaler

# MultinomialNB expects non-negative feature values, so rescale the
# features first. The scaler is fitted on the training split only and the
# same transformation is then applied to both splits.
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

mnb = MultinomialNB()
mnb.fit(X_train_scaled, y_train)
# The model was trained on scaled features, so it must also be scored on
# the scaled test set (scoring on the raw X_test gives a meaningless number).
print('模型得分:{:.3f}'.format(mnb.score(X_test_scaled, y_test)))

# Visualise the fit. The grid and the scatter points live in the original
# feature space; the grid nodes are pushed through the same scaler before
# prediction so the coloured regions match what the model actually learned.
plt.figure(dpi=100)
import matplotlib.pyplot as plt
x_min, x_max = X[:, 0].min() - 0.5, X[:, 0].max() + 0.5
y_min, y_max = X[:, 1].min() - 0.5, X[:, 1].max() + 0.5
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
grid_points = np.c_[xx.ravel(), yy.ravel()]
z = mnb.predict(scaler.transform(grid_points)).reshape(xx.shape)
plt.pcolormesh(xx, yy, z, cmap=plt.cm.Pastel1)
plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.cool, edgecolor='k')
plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.cool, marker='*', edgecolor='k')
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("Classifier:MultinomialNB")
plt.show()

威斯康星乳腺肿瘤分类预测

# Load the breast cancer data set that ships with scikit-learn.
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

X = cancer.data
y = cancer.target
print(X.shape, y.shape)

# Keep 25% of the samples aside as the test set (fixed seed).
split = train_test_split(X, y, test_size=0.25, random_state=8)
X_train, X_test, y_train, y_test = split

# Train a Gaussian naive Bayes classifier and print its score on both splits.
gnb = GaussianNB()
gnb.fit(X_train, y_train)
train_score = gnb.score(X_train, y_train)
test_score = gnb.score(X_test, y_test)
print('训练集得分:{:.3f}'.format(train_score))
print('测试集得分:{:.3f}'.format(test_score))

# Check the model on a single sample (index 312 of the full data set):
# print the predicted class next to the true label.
sample = [X[312]]
print('模型预测分类是:{}'.format(gnb.predict(sample)))
print('样本的正确分类是:', y[312])

# Per-class probabilities predicted for that same sample.
gnb.predict_proba(sample)

K-Means算法

#随机生成含150个类别数为3的数据集
from sklearn.datasets import make_blobs
X,y = make_blobs(n_samples = 150, centers = 3, random_state = 8 )
#数据可视化
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(dpi=100)
plt.scatter(X[:,0],X[:,1],c=y,cmap=plt.cm.spring,edgecolors='k')

# Import the KMeans estimator.
from sklearn.cluster import KMeans

# Instantiate the model with 3 clusters.
# The former `n_jobs=None` argument is intentionally not passed: None was
# its default value, and the parameter was removed from KMeans in
# scikit-learn 1.0, where passing it raises a TypeError.
kmeans = KMeans(n_clusters=3)

# Train the model.
kmeans.fit(X)

# Visualise the clustering result.
import numpy as np
from matplotlib.colors import ListedColormap

# Colour maps for the background regions (light) and for points (bold).
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Build a fine grid over the two features, with a margin of 1 on each side.
x_min, y_min = X.min(axis=0) - 1
x_max, y_max = X.max(axis=0) + 1
grid_x = np.arange(x_min, x_max, 0.02)
grid_y = np.arange(y_min, y_max, 0.02)
xx, yy = np.meshgrid(grid_x, grid_y)

# Assign every grid node to a cluster and colour the plane accordingly.
grid_points = np.column_stack((xx.ravel(), yy.ravel()))
Z = kmeans.predict(grid_points).reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Draw the samples on top, coloured by their generating label `y`.
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolor='k', s=20)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")
plt.show()
# Inspect the coordinates of the cluster centres.
kmeans.cluster_centers_

# Same region plot as before, this time with the cluster centres marked.
import numpy as np
from matplotlib.colors import ListedColormap

# Colour maps for the background regions (light) and for points (bold).
cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])

# Grid over the two features, with a margin of 1 on each side.
x_min = X[:, 0].min() - 1
x_max = X[:, 0].max() + 1
y_min = X[:, 1].min() - 1
y_max = X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.02),
                     np.arange(y_min, y_max, 0.02))

# Cluster assignment of every grid node, shown as the background colour.
Z = kmeans.predict(np.column_stack((xx.ravel(), yy.ravel())))
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)

# Samples (coloured by generating label) and centroids (red stars).
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolor='k', s=20)
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1],
            s=200, marker='*', c='red', label='centroids')
plt.legend()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")

# Inspect the cluster label assigned to each training sample.
kmeans.labels_

# Inspect the number of iterations the fit ran.
kmeans.n_iter_

DBSCAN

# Generate two-moons toy data: one noisy set and one noise-free set,
# 200 samples each with the same seed, and scatter-plot both.
from sklearn.datasets import make_moons

noisy_moons = make_moons(n_samples=200, noise=0.05, random_state=8)
X, y = noisy_moons
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolor='k')

clean_moons = make_moons(n_samples=200, noise=0, random_state=8)
X1, y1 = clean_moons
plt.scatter(X1[:, 0], X1[:, 1], c=y1, cmap=plt.cm.spring, edgecolor='k')
# Instantiate K-Means with 2 clusters and random centroid initialisation.
# The former `n_jobs=None` argument is intentionally not passed: None was
# its default value, and the parameter was removed from KMeans in
# scikit-learn 1.0, where passing it raises a TypeError.
kmeans = KMeans(n_clusters=2, init='random')

# Train the model on the noisy moons and get each sample's cluster label.
y_km = kmeans.fit_predict(X)

# Visualise the K-Means regions and cluster centres on the moons data.
import numpy as np
from matplotlib.colors import ListedColormap
# Colour maps for the background regions (light) and for points (bold).
cmap_light = ListedColormap(['#FFAAAA','#AAFFAA','#AAAAFF'])
cmap_bold = ListedColormap(['#FF0000','#00FF00','#0000FF'])
# Build a fine grid over the two features, with a margin of 1 on each side.
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, .02),
                     np.arange(y_min, y_max, .02))
Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])
# Colour the plane by the cluster each grid node is assigned to.
Z = Z.reshape(xx.shape)
plt.pcolormesh(xx, yy, Z, cmap=cmap_light)
# Samples (coloured by generating label) and centroids (red stars).
plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.spring, edgecolor='k', s=20)
plt.scatter(kmeans.cluster_centers_[:,0],kmeans.cluster_centers_[:,1],
           s=200,marker='*',c='red',label='centroids')
plt.legend()
plt.grid()
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.title("K-Means Cluster")
# Plot the samples by the cluster K-Means assigned them to (`y_km`).
plt.scatter(X[y_km==0,0],X[y_km==0,1],c='red',marker='o',s=40,label='cluster 1',edgecolor='k',)
plt.scatter(X[y_km==1,0],X[y_km==1,1],c='green',marker='s',s=40,label='cluster 2',edgecolor='k',)
# These are K-Means labels, so the plot must not be titled as DBSCAN output.
plt.title('K-Means clustering')
plt.legend()
# Import the DBSCAN estimator.
from sklearn.cluster import DBSCAN

# Instantiate the model, then fit it on the moons data and get each
# sample's cluster label in one call.
db = DBSCAN(eps=0.2, min_samples=5)
y_db = db.fit_predict(X)

# Plot the samples labelled 0 and 1, one colour/marker per cluster.
# Samples carrying any other label are not drawn.
cluster_styles = (
    (0, 'red', 'o', 'cluster 1'),
    (1, 'green', 's', 'cluster 2'),
)
for cluster_id, colour, shape, caption in cluster_styles:
    members = y_db == cluster_id
    plt.scatter(X[members, 0], X[members, 1], c=colour, marker=shape,
                s=40, label=caption, edgecolor='k')
plt.title('DBSCAN clustering')
plt.legend()
# DBSCAN on a data set loaded from CSV, tried with three parameter settings.
plt.figure(dpi=100)
import pandas as pd
dbscan_data = pd.read_csv("dbscan_data.csv")
dbscan_data.head()
plt.scatter(dbscan_data['x1'], dbscan_data['x2'])
# Import the DBSCAN estimator.
from sklearn.cluster import DBSCAN

# Remember which columns are features BEFORE any label column is added.
# Every fit below uses only these columns; otherwise the 'cluster_db'
# labels written by one run would be fed to the next run as a feature.
feature_columns = list(dbscan_data.columns)

# NOTE(review): in each run the labels index straight into `colors`.
# DBSCAN marks noise as -1, which picks the LAST palette entry via negative
# indexing, and a label >= len(colors) raises IndexError. Confirm each
# palette is long enough for the data.

# Run 1: eps=0.5, min_samples=100.
db = DBSCAN(eps=0.5,min_samples=100)
db.fit(dbscan_data[feature_columns])

labels6=db.labels_
dbscan_data['cluster_db']=labels6
colors=np.array(['red'])
plt.figure(figsize = (15,8))
plt.scatter(dbscan_data['x1'],dbscan_data['x2'],c=colors[dbscan_data['cluster_db']])

# Run 2: smaller neighbourhood radius (eps=0.2), min_samples=100.
db = DBSCAN(eps=0.2,min_samples=100).fit(dbscan_data[feature_columns])
labels6=db.labels_
dbscan_data['cluster_db']=labels6
colors=np.array(['red','green','blue','yellow','teal','orange','black','goldenrod','tomato'])
plt.figure(figsize = (15,8))
plt.scatter(dbscan_data['x1'],dbscan_data['x2'],c=colors[dbscan_data['cluster_db']])

# Run 3: eps=0.2 with a lower density threshold (min_samples=30).
from sklearn.cluster import DBSCAN
db = DBSCAN(eps=0.2,min_samples=30)
db.fit(dbscan_data[feature_columns])
labels6=db.labels_
dbscan_data['cluster_db']=labels6
colors=np.array(['red','green','blue','yellow','teal','orange','cyan','black','goldenrod','tomato','#123456','#563211','green'])
plt.figure(figsize = (15,8))
plt.scatter(dbscan_data['x1'],dbscan_data['x2'],c=colors[dbscan_data['cluster_db']])

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值