机器学习之对鸢尾花数据集和月亮数据集,分别采用线性LDA、k-means和SVM算法进行二分类可视化分析

一、采用线性LDA算法进行二分类可视化分析

1、鸢尾花数据集(注:下方示例代码实际使用 make_classification 生成的二维二分类模拟数据,而非真实的鸢尾花数据)

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets.samples_generator import make_classification

class LDA():
    """Two-class Fisher linear discriminant for samples labelled 0 and 1.

    After ``Train`` the instance exposes ``mju1``/``mju2`` (class means),
    ``cov1``/``cov2`` (per-class scatter matrices), ``Sw`` (within-class
    scatter matrix) and ``w`` (projection weights, shape ``(n_features, 1)``).
    """

    def Train(self, X, y):
        """Fit the discriminant direction from labelled samples.

        Args:
            X: training samples, array-like of shape (n_samples, n_features).
            y: training labels (0 or 1), one per row of ``X``.

        Raises:
            ValueError: if one of the two classes has no training sample.
            numpy.linalg.LinAlgError: if the within-class scatter matrix is
                singular.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        X1 = X[y == 0]
        X2 = X[y == 1]
        if len(X1) == 0 or len(X2) == 0:
            raise ValueError("Train needs at least one sample of class 0 and of class 1")

        # Class centres.
        mju1 = np.mean(X1, axis=0)
        mju2 = np.mean(X2, axis=0)

        # Per-class scatter matrices (un-normalised covariance).
        cov1 = np.dot((X1 - mju1).T, (X1 - mju1))
        cov2 = np.dot((X2 - mju2).T, (X2 - mju2))
        Sw = cov1 + cov2

        # Solve Sw * w = (mju1 - mju2) rather than forming the explicit inverse.
        w = np.linalg.solve(Sw, (mju1 - mju2).reshape((len(mju1), 1)))

        # Store the training result.
        self.mju1 = mju1  # centre of class 0
        self.cov1 = cov1
        self.mju2 = mju2  # centre of class 1
        self.cov2 = cov2
        self.Sw = Sw  # within-class scatter matrix
        self.w = w  # discriminant weights, shape (n_features, 1)

    def Test(self, X, y):
        """Classify ``X``, print a short report and return the accuracy.

        Args:
            X: test samples, array-like of shape (n_samples, n_features).
            y: true labels (0 or 1), one per row of ``X``.

        Returns:
            Fraction of correctly classified samples; ``0.0`` for an empty
            test set.
        """
        y = np.asarray(y).ravel()
        nums = len(y)
        if nums:
            X = np.asarray(X, dtype=float)
            w = np.asarray(self.w).reshape(-1)
            # Decision threshold: projection of the midpoint between the
            # two class centres onto w.
            c = 0.5 * float(np.dot(w, self.mju1 + self.mju2))
            h = np.dot(X, w) - c
            # Projections on the class-0 side of the threshold are labelled 0.
            y_hat = np.where(h >= 0, 0, 1)
            count = int(np.count_nonzero(y_hat == y))
            precise = count / nums
        else:
            # Nothing to classify: avoid dividing by zero.
            count = 0
            precise = 0.0
        # Report.
        print("测试样本数量:", nums)
        print("预测正确样本的数量:", count)
        print("测试准确度:", precise)
        return precise
if '__main__' == __name__:
    # Generate a synthetic two-feature, two-class data set.
    n_samples = 500
    X, y = make_classification(n_samples=n_samples, n_features=2, n_redundant=0, n_classes=2,n_informative=1, n_clusters_per_class=1, class_sep=0.5, random_state=10)
    # Two-class linear discriminant analysis.
    lda = LDA()
    # 60% for training, 40% for testing. A single split index guarantees
    # that every sample lands in exactly one of the two subsets.
    n_train = n_samples * 3 // 5
    Xtrain = X[:n_train, :]
    Ytrain = y[:n_train]
    Xtest = X[n_train:, :]
    Ytest = y[n_train:]
    lda.Train(Xtrain, Ytrain)
    precise = lda.Test(Xtest, Ytest)
    # Plot the raw data, coloured by the true label.
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title("Test precise:" + str(precise))
    plt.show()

运行结果:
在这里插入图片描述

2、月亮数据集

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
class LDA():
    """Two-class Fisher linear discriminant for samples labelled 0 and 1.

    After ``Train`` the instance exposes ``mju1``/``mju2`` (class means),
    ``cov1``/``cov2`` (per-class scatter matrices), ``Sw`` (within-class
    scatter matrix) and ``w`` (projection weights, shape ``(n_features, 1)``).
    """

    def Train(self, X, y):
        """Fit the discriminant direction from labelled samples.

        Args:
            X: training samples, array-like of shape (n_samples, n_features).
            y: training labels (0 or 1), one per row of ``X``.

        Raises:
            ValueError: if one of the two classes has no training sample.
            numpy.linalg.LinAlgError: if the within-class scatter matrix is
                singular.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        X1 = X[y == 0]
        X2 = X[y == 1]
        if len(X1) == 0 or len(X2) == 0:
            raise ValueError("Train needs at least one sample of class 0 and of class 1")
        # Class centres.
        mju1 = np.mean(X1, axis=0)
        mju2 = np.mean(X2, axis=0)
        # Per-class scatter matrices (un-normalised covariance).
        cov1 = np.dot((X1 - mju1).T, (X1 - mju1))
        cov2 = np.dot((X2 - mju2).T, (X2 - mju2))
        Sw = cov1 + cov2
        # Solve Sw * w = (mju1 - mju2) rather than forming the explicit inverse.
        w = np.linalg.solve(Sw, (mju1 - mju2).reshape((len(mju1), 1)))
        # Store the training result.
        self.mju1 = mju1  # centre of class 0
        self.cov1 = cov1
        self.mju2 = mju2  # centre of class 1
        self.cov2 = cov2
        self.Sw = Sw  # within-class scatter matrix
        self.w = w  # discriminant weights, shape (n_features, 1)

    def Test(self, X, y):
        """Classify ``X``, print a short report and return the accuracy.

        Args:
            X: test samples, array-like of shape (n_samples, n_features).
            y: true labels (0 or 1), one per row of ``X``.

        Returns:
            Exact fraction of correctly classified samples; ``0.0`` for an
            empty test set.
        """
        y = np.asarray(y).ravel()
        nums = len(y)
        if nums:
            X = np.asarray(X, dtype=float)
            w = np.asarray(self.w).reshape(-1)
            # Decision threshold: projection of the midpoint between the
            # two class centres onto w.
            c = 0.5 * float(np.dot(w, self.mju1 + self.mju2))
            h = np.dot(X, w) - c
            # Projections on the class-0 side of the threshold are labelled 0.
            y_hat = np.where(h >= 0, 0, 1)
            count = int(np.count_nonzero(y_hat == y))
            # Exact ratio: no epsilon in the denominator, so a perfect
            # score is reported as 1.0.
            precise = count / nums
        else:
            # Nothing to classify: handle the empty case explicitly.
            count = 0
            precise = 0.0
        # Report.
        print("测试样本数量:", nums)
        print("预测正确样本的数量:", count)
        print("测试准确度:", precise)
        return precise
if '__main__' == __name__:
    # Generate the two-moons data set.
    X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
    # Two-class linear discriminant analysis.
    lda = LDA()
    # 60% for training, 40% for testing. The test slice starts where the
    # training slice ends, so no training sample is reused for evaluation.
    n_train = 60
    Xtrain = X[:n_train, :]
    Ytrain = y[:n_train]
    Xtest = X[n_train:, :]
    Ytest = y[n_train:]
    lda.Train(Xtrain, Ytrain)
    precise = lda.Test(Xtest, Ytest)
    # Plot the raw data, coloured by the true label.
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
    plt.xlabel("x1")
    plt.ylabel("x2")
    plt.title("Test precise:" + str(precise))
    plt.show()

运行结果:
在这里插入图片描述

二、采用k-means算法进行二分类可视化分析

1、鸢尾花数据集

#鸢尾花数据集的K-means分类
from sklearn import datasets
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
 
# Load the iris data set; the returned object is dictionary-like (similar to a Java Map).
lris_df = datasets.load_iris()
 
# Use feature columns 0 and 2 as the x and y axes of the plot (other columns could be chosen instead).
x_axis = lris_df.data[:,0]
y_axis = lris_df.data[:,2]
 
 
# The number of clusters is fixed to 2 here; tune this parameter for other groupings.
model = KMeans(n_clusters=2)
 
# Fit the model on all feature columns.
model.fit(lris_df.data)
 
# Predict the cluster of one hand-written sample (per the original note, the row with index 100).
# NOTE(review): the result is not used anywhere in the visible script.
prddicted_label= model.predict([[6.3, 3.3, 6, 2.5]])
 
# Predict a cluster label for every row of the data set.
all_predictions = model.predict(lris_df.data)
 
# Scatter plot of all samples, coloured by predicted cluster.
plt.scatter(x_axis, y_axis, c=all_predictions)
plt.show()

运行结果:
在这里插入图片描述

2、月亮数据集

import matplotlib.pyplot as plt
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_moons
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
# Two clusters: this is a binary analysis and only two groups are drawn
# below, so every sample must belong to one of them.
estimator = KMeans(n_clusters=2)  # build the clusterer
estimator.fit(X)  # run the clustering
label_pred = estimator.labels_  # cluster index of every sample
# Plot the k-means result, one scatter series per cluster.
x0 = X[label_pred == 0]
x1 = X[label_pred == 1]
plt.scatter(x0[:, 0], x0[:, 1], c = "red", marker='o', label='label0')
plt.scatter(x1[:, 0], x1[:, 1], c = "green", marker='*', label='label1')
# The moons data has two anonymous coordinates, not iris petal measurements.
plt.xlabel('x1')
plt.ylabel('x2')
plt.legend(loc=2)
plt.show()

运行结果:
在这里插入图片描述

三、采用SVM算法进行二分类可视化分析

1、鸢尾花数据集

from sklearn.svm import SVC
from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
iris = datasets.load_iris()
X = iris["data"][:, (2, 3)]  # petal length, petal width
y = iris["target"]
# Keep only the samples whose target is 0 or 1 to obtain a binary problem.
setosa_or_versicolor = (y == 0) | (y == 1)
X = X[setosa_or_versicolor]
y = y[setosa_or_versicolor]
# Linear SVM classifier. A very large finite C approximates a hard margin;
# an infinite C is rejected by parameter validation in current scikit-learn.
svm_clf = SVC(kernel="linear", C=1e100)
svm_clf.fit(X, y)
def plot_svc_decision_boundary(svm_clf, xmin, xmax):
    """Draw the decision line, both margin lines and the support vectors.

    Args:
        svm_clf: fitted classifier exposing ``coef_``, ``intercept_`` and
            ``support_vectors_``.
        xmin: left end of the horizontal range to draw.
        xmax: right end of the horizontal range to draw.
    """
    weights = svm_clf.coef_[0]
    bias = svm_clf.intercept_[0]

    # On the boundary w0*x0 + w1*x1 + b = 0, hence x1 = -w0/w1 * x0 - b/w1.
    xs = np.linspace(xmin, xmax, 200)
    boundary = -weights[0]/weights[1] * xs - bias/weights[1]

    # Vertical offset of the two margin lines from the boundary.
    offset = 1/weights[1]
    upper = boundary + offset
    lower = boundary - offset

    # Highlight the support vectors first, then draw the three lines.
    support = svm_clf.support_vectors_
    plt.scatter(support[:, 0], support[:, 1], s=180, facecolors='#FFAAAA')
    for ys, fmt in ((boundary, "k-"), (upper, "k--"), (lower, "k--")):
        plt.plot(xs, ys, fmt, linewidth=2)
# Set the font configuration first so it is in place before the Chinese
# title is created.
plt.rcParams['font.sans-serif']=['SimHei']  # font used for Chinese labels
plt.rcParams['axes.unicode_minus']=False
plt.title("大间隔分类", fontsize=16)
plot_svc_decision_boundary(svm_clf, 0, 5.5)
plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs")
plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo")
plt.xlabel("Petal length", fontsize=14)
# The second plotted feature is the petal width; label it like the x axis.
plt.ylabel("Petal width", fontsize=14)
plt.axis([0, 5.5, 0, 2])
plt.show()

运行结果:
在这里插入图片描述

2、月亮数据集

import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib as mpl
from sklearn.datasets import make_moons
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
# Font settings needed to render Chinese text and the minus sign.
mpl.rcParams.update({
    'font.sans-serif': ['SimHei'],
    'axes.unicode_minus': False,
})
X, y = make_moons(n_samples=100, noise=0.15, random_state=42)
def plot_dataset(X, y, axes):
    """Plot the two classes of a 2-D data set on the current axes.

    Args:
        X: samples; column 0 is drawn on the x axis, column 1 on the y axis.
        y: labels; 0 is drawn as blue squares, 1 as green triangles.
        axes: axis limits passed to ``plt.axis``.
    """
    for label, fmt in ((0, "bs"), (1, "g^")):
        members = y == label
        plt.plot(X[:, 0][members], X[:, 1][members], fmt)
    plt.axis(axes)
    plt.grid(True, which='both')
    plt.xlabel(r"$x_1$", fontsize=20)
    plt.ylabel(r"$x_2$", fontsize=20, rotation=0)
    plt.title("月亮数据",fontsize=20)
# Draw the raw moons data inside the fixed window [-1.5, 2.5] x [-1, 1.5] and show it.
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()

结果:
在这里插入图片描述

# Polynomial-feature linear SVM, assembled step by step.
poly_steps = [
    # Expand the inputs to degree-3 polynomial features.
    ("poly_features", PolynomialFeatures(degree=3)),
    # Standardise the expanded features.
    ("scaler", StandardScaler()),
    # Linear SVM classifier with hinge loss.
    ("svm_clf", LinearSVC(C=10, loss="hinge", random_state=42)),
]
polynomial_svm_clf = Pipeline(steps=poly_steps)
polynomial_svm_clf.fit(X, y)
def plot_predictions(clf, axes):
    """Shade the predictions and decision values of ``clf`` over a grid.

    Args:
        clf: fitted estimator exposing ``predict`` and ``decision_function``.
        axes: sequence ``[x_min, x_max, y_min, y_max]`` bounding the grid.
    """
    # Build a 100 x 100 grid of points covering the requested window.
    grid_x, grid_y = np.meshgrid(
        np.linspace(axes[0], axes[1], 100),
        np.linspace(axes[2], axes[3], 100),
    )
    points = np.column_stack((grid_x.ravel(), grid_y.ravel()))

    # Evaluate the classifier on every grid point.
    labels = clf.predict(points).reshape(grid_x.shape)
    scores = clf.decision_function(points).reshape(grid_x.shape)

    # Predicted class first, decision value on top with a lighter alpha.
    for values, opacity in ((labels, 0.2), (scores, 0.1)):
        plt.contourf(grid_x, grid_y, values, cmap=plt.cm.brg, alpha=opacity)
# Shade the fitted polynomial classifier's regions, overlay the samples in the same window, then show.
plot_predictions(polynomial_svm_clf, [-1.5, 2.5, -1, 1.5])
plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
plt.show()

结果:
在这里插入图片描述

from sklearn.svm import SVC
gamma1, gamma2 = 0.1, 5
C1, C2 = 0.001, 1000
# Evaluate every (gamma, C) combination: the figure below is a 2 x 2 grid,
# so all four panels are filled and gamma2 is actually used.
hyperparams = (gamma1, C1), (gamma1, C2), (gamma2, C1), (gamma2, C2)
svm_clfs = []
for gamma, C in hyperparams:
    # Standardise the inputs, then fit an RBF-kernel SVM.
    rbf_kernel_svm_clf = Pipeline([
            ("scaler", StandardScaler()),
            ("svm_clf", SVC(kernel="rbf", gamma=gamma, C=C))
        ])
    rbf_kernel_svm_clf.fit(X, y)
    svm_clfs.append(rbf_kernel_svm_clf)
plt.figure(figsize=(11, 7))
for i, svm_clf in enumerate(svm_clfs):
    plt.subplot(2, 2, i + 1)
    plot_predictions(svm_clf, [-1.5, 2.5, -1, 1.5])
    plot_dataset(X, y, [-1.5, 2.5, -1, 1.5])
    gamma, C = hyperparams[i]
    # Replace the generic title set by plot_dataset with the hyper-parameters.
    plt.title(r"$\gamma = {}, C = {}$".format(gamma, C), fontsize=16)
plt.tight_layout()
plt.show()

结果:
在这里插入图片描述

四、SVM算法优缺点

算法优点:  
    (1)使用核函数可以向高维空间进行映射
    (2)使用核函数可以解决非线性的分类
    (3)分类思想很简单,就是将样本与决策面的间隔最大化
    (4)分类效果较好

算法缺点:
    (1)SVM算法对大规模训练样本难以实施
    (2)用SVM解决多分类问题存在困难
    (3)对缺失数据敏感,对参数和核函数的选择敏感  
参考文献
参考地址

  • 2
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
好的,以下是用Fisher线性判别和支持向量机进行Oxford17Flower数据集分类的代码示例: 首先,我们需要加载Oxford17Flower数据集,这里我们使用sklearn中的datasets库: ```python from sklearn.datasets import load_sample_images from sklearn.model_selection import train_test_split # 加载Oxford17Flower数据集 data = load_sample_images() X = data.images y = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # 将数据集分成训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) ``` 接下来,我们使用Fisher线性判别对数据集进行分类: ```python from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.metrics import accuracy_score # 使用Fisher线性判别进行分类 lda = LinearDiscriminantAnalysis() X_train_lda = lda.fit_transform(X_train.reshape(len(X_train), -1), y_train) X_test_lda = lda.transform(X_test.reshape(len(X_test), -1)) clf_lda = SVC(kernel='linear') clf_lda.fit(X_train_lda, y_train) # 计算分类准确率 y_pred_lda = clf_lda.predict(X_test_lda) acc_lda = accuracy_score(y_test, y_pred_lda) print("Fisher Linear Discriminant Accuracy: {:.2f}%".format(acc_lda*100)) ``` 最后,我们使用支持向量机数据集进行分类: ```python from sklearn.svm import SVC # 使用支持向量机进行分类 clf_svm = SVC(kernel='rbf') clf_svm.fit(X_train.reshape(len(X_train), -1), y_train) # 计算分类准确率 y_pred_svm = clf_svm.predict(X_test.reshape(len(X_test), -1)) acc_svm = accuracy_score(y_test, y_pred_svm) print("SVM Accuracy: {:.2f}%".format(acc_svm*100)) ``` 完整代码如下: ```python from sklearn.datasets import load_sample_images from sklearn.model_selection import train_test_split from sklearn.discriminant_analysis import LinearDiscriminantAnalysis from sklearn.svm import SVC from sklearn.metrics import accuracy_score # 加载Oxford17Flower数据集 data = load_sample_images() X = data.images y = [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1] # 将数据集分成训练集和测试集 X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # 使用Fisher线性判别进行分类 lda = LinearDiscriminantAnalysis() X_train_lda = lda.fit_transform(X_train.reshape(len(X_train), -1), y_train) X_test_lda = 
lda.transform(X_test.reshape(len(X_test), -1)) clf_lda = SVC(kernel='linear') clf_lda.fit(X_train_lda, y_train) # 计算分类准确率 y_pred_lda = clf_lda.predict(X_test_lda) acc_lda = accuracy_score(y_test, y_pred_lda) print("Fisher Linear Discriminant Accuracy: {:.2f}%".format(acc_lda*100)) # 使用支持向量机进行分类 clf_svm = SVC(kernel='rbf') clf_svm.fit(X_train.reshape(len(X_train), -1), y_train) # 计算分类准确率 y_pred_svm = clf_svm.predict(X_test.reshape(len(X_test), -1)) acc_svm = accuracy_score(y_test, y_pred_svm) print("SVM Accuracy: {:.2f}%".format(acc_svm*100)) ``` 这样,我们就完成了用Fisher线性判别和支持向量机这两种机器学习算法对Oxford17Flower数据集实现植物分类的任务。

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值