数据挖掘之LDA特征降维

1原理介绍

背景:线性判别分析(LDA)常用于数据预处理中的降维,也可用于分类任务。LDA是“有监督”的学习方法,与PCA不同,LDA更关心类别的区分而不是方差。
思想:LDA分类的一个目标是使得不同类别之间的距离越远越好,同一类别之中的距离越近越好。
在这里插入图片描述
原理:将数据投影到维度更低的空间中,使得投影后的点按类别区分,形成一簇一簇的情况;相同类别的点,在投影后的空间中将会更接近。
在这里插入图片描述 在这里插入图片描述

在这里插入图片描述

在这里插入图片描述

2手写代码实现LDA

import numpy as np
import pandas as pd
from scipy import linalg

def cal_mean(X, y):
    """
    Compute the per-class mean of the rows of X.

    X: array whose rows are samples; rows are selected with the boolean
        mask ``y == label``.
    y: array of class labels aligned with the rows of X.

    Returns a dict mapping each distinct label in y to the mean (taken
    along axis 0) of the rows of X carrying that label.
    """
    return {label: np.mean(X[y == label], axis=0) for label in np.unique(y)}


def cal_Sw(X, y, mean_vectors):
    """
    Compute the within-class scatter matrix.

    X: 2-D array, one sample per row (its second dimension sets the size
        of the result).
    y: array of class labels aligned with the rows of X.
    mean_vectors: dict mapping class label -> mean vector of that class.

    Returns a square array with X.shape[1] rows and columns: the sum, over
    every class in mean_vectors, of D^T D where D holds that class's rows
    of X minus the class mean.
    """
    n_features = X.shape[1]
    scatter = np.zeros((n_features, n_features))
    for label, class_mean in mean_vectors.items():
        # Deviations of this class's samples from their own mean.
        centered = X[y == label] - class_mean
        scatter += centered.T.dot(centered)
    return scatter



def cal_Sb(X, y, cls_mean_vectors, overall_mean):
    """
    Compute the between-class scatter matrix.

    X: 2-D array, one sample per row.
    y: array of class labels aligned with the rows of X.
    cls_mean_vectors: dict mapping class label -> mean vector of that class.
    overall_mean: mean vector of all samples.

    Returns a square array with X.shape[1] rows and columns: the sum, over
    every class, of n_c * (m_c - m)(m_c - m)^T, where n_c is the number of
    rows of X in the class, m_c the class mean and m the overall mean.
    """
    n_features = X.shape[1]
    between = np.zeros((n_features, n_features))
    # Column-vector form of the overall mean; it is the same for every class.
    grand_mean_col = overall_mean.reshape(overall_mean.shape[0], 1)
    for label, class_mean in cls_mean_vectors.items():
        class_size = X[y == label, :].shape[0]
        offset = class_mean.reshape(class_mean.shape[0], 1) - grand_mean_col
        between += class_size * offset.dot(offset.T)
    return between



def cal_eigVec_eigVal(Sw, Sb):
    """
    Compute the eigenvalues and eigenvectors of Sw^(-1) * Sb.

    Sw: within-class scatter matrix (square, must be invertible).
    Sb: between-class scatter matrix, same shape as Sw.

    Returns (eig_vals, eig_vecs) as produced by scipy.linalg.eig; column
    ``eig_vecs[:, i]`` is the eigenvector belonging to ``eig_vals[i]``.

    Raises numpy.linalg.LinAlgError if Sw cannot be inverted.
    """
    # Pass a plain ndarray: the former np.mat(...) wrapper is unnecessary
    # and np.mat no longer exists in NumPy 2.x.
    target = np.linalg.inv(Sw).dot(Sb)
    eig_vals, eig_vecs = linalg.eig(target)
    return eig_vals, eig_vecs

def cal_y(X, eig_pairs, n_compoments=1):
    """
    Project the samples onto the leading eigenvectors.

    X: 2-D array, one sample per row.
    eig_pairs: sequence of (eigenvalue, eigenvector) pairs, already ordered
        so that the directions to keep come first.
    n_compoments: number of leading pairs to use (default 1).

    Returns an array with one row per sample and one column per selected
    eigenvector.
    """
    # Stack the selected eigenvectors as the rows of the projection matrix.
    W = np.array([vec for _, vec in eig_pairs[:n_compoments]])
    projected = W.dot(X.T)
    return projected.T

if __name__ == '__main__':
    # Script entry point: download the Iris data set, run the hand-written
    # LDA steps and print each intermediate result.
    from sklearn.preprocessing import LabelEncoder

    # Column index -> feature name for the four measurement columns.
    feature_dict = dict(enumerate((
        'sepal length in cm',
        'sepal width in cm',
        'petal length in cm',
        'petal width in cm',
    )))
    feature_names = [feature_dict[idx] for idx in sorted(feature_dict)]

    df = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
        header=None,
        sep=',',
    )
    df.columns = feature_names + ['class label']
    df.dropna(how="all", inplace=True)  # to drop the empty line at file-end

    X = df[feature_names].values
    y = df['class label'].values

    # Encode the string class labels as integers starting at 1.
    enc = LabelEncoder()
    label_encoder = enc.fit(y)
    y = label_encoder.transform(y) + 1

    cls_mean_vectors = cal_mean(X, y)
    print(cls_mean_vectors)

    Sw = cal_Sw(X, y, cls_mean_vectors)
    print(Sw)

    overall_mean = np.mean(X, axis=0)
    Sb = cal_Sb(X, y, cls_mean_vectors, overall_mean)
    print(Sb)

    eig_vals, eig_vecs = cal_eigVec_eigVal(Sw=Sw, Sb=Sb)
    print(eig_vals)
    print(eig_vecs)

    # Pair each eigenvalue magnitude with its eigenvector (a column of
    # eig_vecs) and order the pairs from largest to smallest magnitude.
    eig_pairs = [(np.abs(val), eig_vecs[:, idx]) for idx, val in enumerate(eig_vals)]
    eig_pairs.sort(key=lambda pair: pair[0], reverse=True)

    yt = cal_y(X, eig_pairs, 2)
    print(yt[:3, :])

输出结果值:
在这里插入图片描述

def plot_step_lda(X_lda):
    """
    Scatter-plot the first two LDA components, one marker/color per class.

    Besides its argument, this function reads two module-level names that
    must be defined before it is called: ``y`` (class labels 1..3 aligned
    with the rows of ``X_lda``) and ``label_dict`` (label -> legend text).

    X_lda: 2-D array; columns 0 and 1 are drawn as LD1 and LD2. Only the
        real part of the values is plotted.
    """
    # Local import: the script section this function belongs to does not
    # import pyplot at module level.
    import matplotlib.pyplot as plt

    ax = plt.subplot(111)
    for label, marker, color in zip(
            range(1, 4), ('^', 's', 'o'), ('blue', 'red', 'green')):
        mask = y == label
        plt.scatter(x=X_lda[:, 0].real[mask],
                    y=X_lda[:, 1].real[mask],
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title('LDA: Iris projection onto the first 2 linear discriminants')

    # Hide axis ticks but keep the tick labels. tick_params expects
    # booleans; the strings "off"/"on" are truthy and would not hide
    # anything.
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    # Remove axis spines.
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

    plt.grid()
    plt.tight_layout()  # must be called; the bare attribute is a no-op
    plt.show()

在这里插入图片描述

3 sklearn案例介绍


import pandas as pd
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis


def plot_scikit_lda(X, title):
    """
    Scatter-plot the first two columns of an LDA projection, one
    marker/color per class.

    Besides its arguments, this function reads two module-level names that
    must be defined before it is called: ``y`` (class labels 1..3 aligned
    with the rows of ``X``) and ``label_dict`` (label -> legend text).

    X: 2-D array; column 0 is drawn as LD1 and column 1, negated, as LD2.
    title: text used as the plot title.
    """
    ax = plt.subplot(111)
    for label, marker, color in zip(
            range(1, 4), ('^', 's', 'o'), ('blue', 'red', 'green')):
        mask = y == label
        plt.scatter(x=X[:, 0][mask],
                    y=X[:, 1][mask] * -1,  # flip the figure
                    marker=marker,
                    color=color,
                    alpha=0.5,
                    label=label_dict[label])

    plt.xlabel('LD1')
    plt.ylabel('LD2')

    leg = plt.legend(loc='upper right', fancybox=True)
    leg.get_frame().set_alpha(0.5)
    plt.title(title)

    # Hide axis ticks but keep the tick labels. tick_params expects
    # booleans; the strings "off"/"on" are truthy and would not hide
    # anything.
    plt.tick_params(axis="both", which="both", bottom=False, top=False,
                    labelbottom=True, left=False, right=False, labelleft=True)

    # Remove axis spines.
    for side in ("top", "right", "bottom", "left"):
        ax.spines[side].set_visible(False)

    plt.grid()
    plt.tight_layout()  # must be called; the bare attribute is a no-op
    plt.show()


if __name__ == '__main__':
    # LDA dimensionality reduction with scikit-learn on the Iris data set.

    # Column index -> feature name for the four measurement columns.
    feature_dict = dict(enumerate((
        'sepal length in cm',
        'sepal width in cm',
        'petal length in cm',
        'petal width in cm',
    )))
    feature_names = [feature_dict[idx] for idx in sorted(feature_dict)]

    df = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data',
        header=None,
        sep=',',
    )
    df.columns = feature_names + ['class label']
    df.dropna(how="all", inplace=True)  # to drop the empty line at file-end

    X = df[feature_names].values
    y = df['class label'].values

    # Encode the string class labels as integers starting at 1; the plot
    # function reads the module-level `y` and `label_dict`.
    enc = LabelEncoder()
    label_encoder = enc.fit(y)
    y = label_encoder.transform(y) + 1
    label_dict = {1: 'setosa', 2: 'versicolor', 3: "virginica"}

    clf = LinearDiscriminantAnalysis(n_components=2)
    X_d = clf.fit_transform(X, y)
    plot_scikit_lda(X_d, title='Default LDA via scikit-learn')


在这里插入图片描述

  • 1
    点赞
  • 14
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值