PCA过程:
- 零均值化
- 求协方差
- 求特征值及特征向量
- 按特征值大小排序特征向量
- 取前k行组成变换矩阵
- 使用变换矩阵即可进行降维或还原
测试数据集(68040*32):CorelFeatures-mld/ColorHistogram.asc
#include <iostream>
#include <math.h>
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>
#include <iomanip>
using namespace std;
/**
* @brief 计算协方差矩阵
* @param mat m*n 输入矩阵
* @param covar n*n 协方差矩阵
* @param mean 列均值向量
* @param scale 是否缩放
* @return
*/
template<typename _Tp>
int calcCovarMatrix(const std::vector<std::vector<_Tp>>& mat, std::vector<std::vector<_Tp>>& covar, std::vector<_Tp>& mean, bool scale = false)
{
const int rows = mat.size();
const int cols = mat[0].size();
const int nsamples = rows;
double scale_ = 1.;
if (scale) scale_ = 1. / (nsamples /*- 1*/);
covar.resize(cols);
for (int i = 0; i < cols; ++i)
covar[i].resize(cols, (_Tp)0);
mean.resize(cols, (_Tp)0);
for (int w = 0; w < cols; ++w) {
for (int h = 0; h < rows; ++h) {
mean[w] += mat[h][w];
}
}
for (auto& value : mean) {
value = 1. / rows * value;
}
for (int i = 0; i < cols; ++i) {
std::vector<_Tp> col_buf(rows, (_Tp)0);
for (int k = 0; k < rows; ++k)
col_buf[k] = mat[k][i] - mean[i];
for (int j = 0; j < cols; ++j) {
double s0 = 0;
for (int k = 0; k < rows; ++k) {
s0 += col_buf[k] * (mat[k][j] - mean[j]);
}
covar[i][j] = (_Tp)(s0 * scale_);
}
}
return 0;
}
/**
* @brief 求实对称矩阵的特征值及特征向量的雅克比法
* 利用雅格比(Jacobi)方法求实对称矩阵的全部特征值及特征向量
* @param pMatrix n*n 存放实对称矩阵
* @param pdblVects n*n 返回特征向量(按列存储)
* @param pdbEigenValues 特征值数组
* @param dbEps 精度
* @param nJt 最大迭代次数
* @return
*/
int JacbiCor(const vector<vector<double> >& mat, vector<vector<double> >& vects, vector<double>& values, double dbEps, int nJt)
{
int nDim = mat.size();
double pMatrix[nDim*nDim];
for(int i=0; i<nDim; i++){
for(int j=0; j<nDim; j++){
pMatrix[i*nDim+j] = mat[i][j];
}
}
double* pdblVects = new double[nDim*nDim];
double* pdbEigenValues = new double[nDim];
for(int i = 0; i < nDim; i ++)
{
pdblVects[i*nDim+i] = 1.0f;
for(int j = 0; j < nDim; j ++)
{
if(i != j)
pdblVects[i*nDim+j]=0.0f;
}
}
int nCount = 0; //迭代次数
while(1)
{
//在pMatrix的非对角线上找到最大元素
double dbMax = pMatrix[1];
int nRow = 0;
int nCol = 1;
for (int i = 0; i < nDim; i ++) //行
{
for (int j = 0; j < nDim; j ++) //列
{
double d = fabs(pMatrix[i*nDim+j]);
if((i!=j) && (d> dbMax))
{
dbMax = d;
nRow = i;
nCol = j;
}
}
}
if(dbMax < dbEps) //精度符合要求
break;
if(nCount > nJt) //迭代次数超过限制
break;
nCount++;
double dbApp = pMatrix[nRow*nDim+nRow];
double dbApq = pMatrix[nRow*nDim+nCol];
double dbAqq = pMatrix[nCol*nDim+nCol];
//计算旋转角度
double dbAngle = 0.5*atan2(-2*dbApq,dbAqq-dbApp);
double dbSinTheta = sin(dbAngle);
double dbCosTheta = cos(dbAngle);
double dbSin2Theta = sin(2*dbAngle);
double dbCos2Theta = cos(2*dbAngle);
pMatrix[nRow*nDim+nRow] = dbApp*dbCosTheta*dbCosTheta +
dbAqq*dbSinTheta*dbSinTheta + 2*dbApq*dbCosTheta*dbSinTheta;
pMatrix[nCol*nDim+nCol] = dbApp*dbSinTheta*dbSinTheta +
dbAqq*dbCosTheta*dbCosTheta - 2*dbApq*dbCosTheta*dbSinTheta;
pMatrix[nRow*nDim+nCol] = 0.5*(dbAqq-dbApp)*dbSin2Theta + dbApq*dbCos2Theta;
pMatrix[nCol*nDim+nRow] = pMatrix[nRow*nDim+nCol];
for(int i = 0; i < nDim; i ++)
{
if((i!=nCol) && (i!=nRow))
{
int u = i*nDim + nRow; //p
int w = i*nDim + nCol; //q
dbMax = pMatrix[u];
pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
}
}
for (int j = 0; j < nDim; j ++)
{
if((j!=nCol) && (j!=nRow))
{
int u = nRow*nDim + j; //p
int w = nCol*nDim + j; //q
dbMax = pMatrix[u];
pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
}
}
//计算特征向量
for(int i = 0; i < nDim; i ++)
{
int u = i*nDim + nRow; //p
int w = i*nDim + nCol; //q
dbMax = pdblVects[u];
pdblVects[u] = pdblVects[w]*dbSinTheta + dbMax*dbCosTheta;
pdblVects[w] = pdblVects[w]*dbCosTheta - dbMax*dbSinTheta;
}
}
//对特征值进行排序以及重新排列特征向量,特征值即pMatrix主对角线上的元素
std::map<double,int> mapEigen;
for(int i = 0; i < nDim; i ++)
{
pdbEigenValues[i] = pMatrix[i*nDim+i];
mapEigen.insert(make_pair( pdbEigenValues[i],i ) );
}
double *pdbTmpVec = new double[nDim*nDim];
std::map<double,int>::reverse_iterator iter = mapEigen.rbegin();
for (int j = 0; iter != mapEigen.rend(),j < nDim; ++iter,++j)
{
for (int i = 0; i < nDim; i ++)
{
pdbTmpVec[i*nDim+j] = pdblVects[i*nDim + iter->second];
}
//特征值重新排列
pdbEigenValues[j] = iter->first;
}
//设定正负号
for(int i = 0; i < nDim; i ++)
{
double dSumVec = 0;
for(int j = 0; j < nDim; j ++)
dSumVec += pdbTmpVec[j * nDim + i];
if(dSumVec<0)
{
for(int j = 0;j < nDim; j ++)
pdbTmpVec[j * nDim + i] *= -1;
}
}
memcpy(pdblVects,pdbTmpVec,sizeof(double)*nDim*nDim);
for(int i=0; i<nDim; i++){
values[i] = pdbEigenValues[i];
for(int j=0; j<nDim; j++){
vects[i][j] = pdblVects[i*nDim+j];
}
}
delete []pdbTmpVec;
delete []pdblVects;
delete []pdbEigenValues;
return 0;
}
/**
* @brief 矩阵乘法
* 须满足 a的列数 == b的行数
* @param a m*k 矩阵
* @param b k*n 矩阵
* @param result m*n 矩阵 乘法结果
* @return
*/
int mul(const vector<vector<double> >& a, const vector<vector<double> >& b, vector<vector<double> >& result){
const int m = a.size();
const int n = a[0].size();
const int k = b[0].size();
//cout<<"mul: "<<a.size()<<"*"<<a[0].size()<<" "<<b.size()<<"*"<<b[0].size()<<endl;
for (int i = 0; i < m; i++) //a的行数
{
vector<double> t;
for (int j = 0; j < k; j++) //b的列数
{
double sum = 0;
for (int w = 0; w < n; w++) //a的列数
{
sum += a[i][w] * b[w][j];
}
t.push_back(sum);
}
result.push_back(t);
}
return 0;
}
/**
* @brief 向量方差
* 须满足 a b 维数相等
* @param a n维向量
* @param b n维向量
* @param result 方差
* @return
*/
int variance(vector<double> a, vector<double> b, double& result){
result = 0;
const int n = a.size();
for(int i=0; i<n; i++){
result += (a[i]-b[i])*(a[i]-b[i]);
}
result /= n;
return 0;
}
const int m = 68040; //行,向量数
const int n = 32; //列,维数
double kee = 0.95; //能量阈值,计算最小满足的k 也可直接指定k
int main(){
fstream fs("ColorHistogram.asc");
vector<vector<double> > data; //原始数据
vector<double> mean;//均值
vector<vector<double> > covar;//协方差
for(int i=0; i<m; i++){
vector<double> line;
for(int j=0; j<=n; j++){
double t;
fs>>t;
if(j != 0){
line.push_back(t);
}
}
data.push_back(line);
}
fs.close();
cout<<"read data over."<<endl;
calcCovarMatrix(data, covar, mean);
cout<<"calcCovarMatrix over."<<endl;
vector<vector<double> > pdblVects;
pdblVects.resize(n);
for(int i=0; i<n; i++) pdblVects[i].resize(n);
vector<double> pdbEigenValues;
pdbEigenValues.resize(n);
JacbiCor(covar, pdblVects, pdbEigenValues, 0.1, 1000);
cout<<"JacbiCor over."<<endl;
cout<<"eigenvalue: "<<endl;
double sum_pdbEigenValues = 0;
for(auto v : pdbEigenValues){
sum_pdbEigenValues += v;
cout<<v<<"\t";
}
cout<<endl;
//计算最小满足kee的k值
int k = 0;
double t_sum = 0;
for(; k<n; k++){
t_sum += pdbEigenValues[k];
cout<<k<<" "<<t_sum/sum_pdbEigenValues<<endl;
if(t_sum/sum_pdbEigenValues > kee){
break;
}
}
k++;
//k = 32;
cout<<"k: "<<k<<endl;
//旋转以用于矩阵乘法
vector<vector<double> > rotate_pdblVects;
rotate_pdblVects.resize(n);
for(int i=0; i<k; i++){
for(int j=0; j<n; j++){
rotate_pdblVects[j].push_back(pdblVects[i][j]);
}
}
//降维
vector<vector<double> > z;
mul(data, rotate_pdblVects, z);
cout<<"to k-dim over."<<endl;
//还原
vector<vector<double> > xap;
mul(z, pdblVects, xap);
cout<<"back n-dim over."<<endl;
//与原始数据进行对比
cout.setf(ios::fixed);
for(int i=0; i<15; i++){
cout<<fixed<<setprecision(4)<<data[0][i]<<"\t";
}
cout<<endl;
for(int i=0; i<15; i++){
cout<<fixed<<setprecision(4)<<xap[0][i]<<"\t";
}
cout<<endl;
//计算与原始数据的标准差
cout<<"variance: "<<endl;
for(int i=0; i<10; i++){
double vari;
variance(data[i], xap[i], vari);
cout<<i<<":\t"<<sqrt(vari)<<endl;
}
return 0;
}