# -*- coding: utf-8 -*-
# author: Xin Chen
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
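# General-purpose multi-class LDA.
# `data` must be a pandas DataFrame: the feature columns come first and the
# last column, named 'label', holds the class label of each row.
# Labels are integers numbered from 0.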
def calWeights(data, K):
    """Compute the K leading linear discriminant (LDA) projection directions.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. The column named ``'label'`` holds the class of
        each row; every other column is treated as a numeric feature.
    K : int
        Number of discriminant directions to keep. Must be strictly smaller
        than the number of distinct labels.

    Returns
    -------
    numpy.matrix
        Real matrix of shape ``(n_features, K)`` (fewer columns if ``K``
        exceeds ``n_features``). Its columns are eigenvectors of
        ``inv(Sw) * Sb`` ordered by decreasing eigenvalue.

    Raises
    ------
    ValueError
        If ``K`` is not smaller than the number of classes.
    numpy.linalg.LinAlgError
        If the within-class scatter matrix is singular.
    """
    label_values = data['label'].values
    labels = np.unique(label_values)
    cNum = len(labels)
    if K >= cNum:
        # Raise instead of print + exit(0): a library function must not kill
        # the interpreter (and certainly not with a "success" status).
        raise ValueError('K is too large, please input again')

    # Feature matrix without the label column, one row per sample.
    features = np.asarray(data.drop('label', axis=1).values, dtype=float)
    dim = features.shape[1]

    # Sw: within-class scatter matrix (sum of the per-class scatters).
    Sw = np.zeros((dim, dim))
    for i in labels:
        Ci = features[label_values == i]
        centered_i = Ci - Ci.mean(axis=0)
        Sw += centered_i.T.dot(centered_i)

    # St: total scatter matrix around the global mean.
    centered = features - features.mean(axis=0)
    St = centered.T.dot(centered)

    # Sb: between-class scatter matrix.
    Sb = St - Sw

    # S = inv(Sw) * Sb, obtained by solving Sw * S = Sb rather than forming
    # the explicit inverse.
    S = np.linalg.solve(Sw, Sb)
    eigValues, eigVectors = np.linalg.eig(S)

    # With Sw positive definite and Sb symmetric the spectrum of S is real,
    # but the general (non-symmetric) solver may still return complex dtype
    # with round-off-sized imaginary parts on the near-zero eigenvalues.
    # Keep the real parts so callers get plain real weights.
    eigValues = np.real(np.asarray(eigValues))
    eigVectors = np.real(np.asarray(eigVectors))

    order = eigValues.argsort()[::-1]  # indices by decreasing eigenvalue
    select = order[:K]
    # Returned as np.matrix so existing callers that use `*` as the matrix
    # product keep working.
    return np.asmatrix(eigVectors[:, select])
def LDAplot(data):
    """Scatter-plot the samples projected onto the two leading LDA directions.

    Note that the plotted points are the projections, not the original
    samples.

    Parameters
    ----------
    data : pandas.DataFrame
        One row per sample. The column named ``'label'`` holds the class of
        each row; every other column is treated as a numeric feature.

    Returns
    -------
    None
        The figure is displayed through ``plt.show()``. If fewer than two
        projection directions are available, a message is printed and
        nothing is drawn.
    """
    w = calWeights(data, K=2)
    if np.shape(w)[1] != 2:
        print('维度太大,无法画图!')
        return

    label_values = data['label'].values
    labels = np.unique(label_values)

    # Project every sample onto the two discriminant directions. Plain real
    # ndarrays are used so the row masks and scatter() below behave
    # predictably; np.real is a no-op for real weights and guards against a
    # complex-dtype result coming out of the eigen-solver.
    features = np.asarray(data.drop('label', axis=1).values, dtype=float)
    CC = np.real(features.dot(np.asarray(w)))

    corlist = ['r', 'y', 'g', 'b', 'w']
    for idx, i in enumerate(labels):
        # Select the ROWS (samples) of class i; the mask has one entry per
        # sample, so it must index axis 0, not the two projected columns.
        CCi = CC[label_values == i, :]
        # Colour by position in the sorted label list, wrapping around, so
        # that more than five classes or non-integer labels do not fail.
        color = corlist[idx % len(corlist)]
        plt.scatter(CCi[:, 0], CCi[:, 1], c=color, marker="o")
    plt.show()
from sklearn import datasets
# Demo: run LDA on the iris data set bundled with scikit-learn, print the two
# leading projection directions and plot the projected samples.
iris = datasets.load_iris()
X = iris.data    # features, one row per sample
Y = iris.target  # class label of each sample
data = pd.DataFrame(X)
data['label'] = Y
Weights = calWeights(data, K=2)
# Single-argument print call: emits the same text as the former Python 2
# statement `print "opticalweights=", Weights` and also runs on Python 3.
print("opticalweights= %s" % (Weights,))
LDAplot(data)