PCA算法cpp实现

PCA过程:

  • 零均值化
  • 求协方差
  • 求特征值及特征向量
  • 按特征值大小排序特征向量
  • 取前k行组成变换矩阵
  • 使用变换矩阵即可进行降维或还原

测试数据集(68040*32):CorelFeatures-mld/ColorHistogram.asc

#include <iostream>
#include <math.h>
#include <string.h>
#include <string>
#include <vector>
#include <map>
#include <fstream>
#include <iomanip>

using namespace std;

/**
* @brief 计算协方差矩阵
* @param mat    m*n 输入矩阵
* @param covar  n*n 协方差矩阵
* @param mean   列均值向量
* @param scale  是否缩放
* @return
*/
template<typename _Tp>
int calcCovarMatrix(const std::vector<std::vector<_Tp>>& mat, std::vector<std::vector<_Tp>>& covar, std::vector<_Tp>& mean, bool scale = false)
{
    const int rows = mat.size();
    const int cols = mat[0].size();
    const int nsamples = rows;
    double scale_ = 1.;
    if (scale) scale_ = 1. / (nsamples /*- 1*/);

    covar.resize(cols);
    for (int i = 0; i < cols; ++i)
        covar[i].resize(cols, (_Tp)0);
    mean.resize(cols, (_Tp)0);

    for (int w = 0; w < cols; ++w) {
        for (int h = 0; h < rows; ++h) {
            mean[w] += mat[h][w];
        }
    }

    for (auto& value : mean) {
        value = 1. / rows * value;
    }

    for (int i = 0; i < cols; ++i) {
        std::vector<_Tp> col_buf(rows, (_Tp)0);
        for (int k = 0; k < rows; ++k)
            col_buf[k] = mat[k][i] - mean[i];

        for (int j = 0; j < cols; ++j) {
            double s0 = 0;
            for (int k = 0; k < rows; ++k) {
                s0 += col_buf[k] * (mat[k][j] - mean[j]);
            }
            covar[i][j] = (_Tp)(s0 * scale_);
        }
    }

    return 0;
}

/**
* @brief 求实对称矩阵的特征值及特征向量的雅克比法
* 利用雅格比(Jacobi)方法求实对称矩阵的全部特征值及特征向量
* @param pMatrix                n*n 存放实对称矩阵
* @param pdblVects              n*n 返回特征向量(按列存储)
* @param pdbEigenValues         特征值数组
* @param dbEps                  精度
* @param nJt                    最大迭代次数
* @return
*/
int JacbiCor(const vector<vector<double> >& mat, vector<vector<double> >& vects, vector<double>& values, double dbEps, int nJt)
{
    int nDim = mat.size();
    double pMatrix[nDim*nDim];
    for(int i=0; i<nDim; i++){
        for(int j=0; j<nDim; j++){
            pMatrix[i*nDim+j] = mat[i][j];
        }
    }
    double* pdblVects = new double[nDim*nDim];
    double* pdbEigenValues = new double[nDim];
    for(int i = 0; i < nDim; i ++)
    {
        pdblVects[i*nDim+i] = 1.0f;
        for(int j = 0; j < nDim; j ++)
        {
            if(i != j)
                pdblVects[i*nDim+j]=0.0f;
        }
    }

    int nCount = 0;     //迭代次数
    while(1)
    {
        //在pMatrix的非对角线上找到最大元素
        double dbMax = pMatrix[1];
        int nRow = 0;
        int nCol = 1;
        for (int i = 0; i < nDim; i ++)          //行
        {
            for (int j = 0; j < nDim; j ++)      //列
            {
                double d = fabs(pMatrix[i*nDim+j]);

                if((i!=j) && (d> dbMax))
                {
                    dbMax = d;
                    nRow = i;
                    nCol = j;
                }
            }
        }

        if(dbMax < dbEps)     //精度符合要求
            break;

        if(nCount > nJt)       //迭代次数超过限制
            break;

        nCount++;

        double dbApp = pMatrix[nRow*nDim+nRow];
        double dbApq = pMatrix[nRow*nDim+nCol];
        double dbAqq = pMatrix[nCol*nDim+nCol];

        //计算旋转角度
        double dbAngle = 0.5*atan2(-2*dbApq,dbAqq-dbApp);
        double dbSinTheta = sin(dbAngle);
        double dbCosTheta = cos(dbAngle);
        double dbSin2Theta = sin(2*dbAngle);
        double dbCos2Theta = cos(2*dbAngle);

        pMatrix[nRow*nDim+nRow] = dbApp*dbCosTheta*dbCosTheta +
            dbAqq*dbSinTheta*dbSinTheta + 2*dbApq*dbCosTheta*dbSinTheta;
        pMatrix[nCol*nDim+nCol] = dbApp*dbSinTheta*dbSinTheta +
            dbAqq*dbCosTheta*dbCosTheta - 2*dbApq*dbCosTheta*dbSinTheta;
        pMatrix[nRow*nDim+nCol] = 0.5*(dbAqq-dbApp)*dbSin2Theta + dbApq*dbCos2Theta;
        pMatrix[nCol*nDim+nRow] = pMatrix[nRow*nDim+nCol];

        for(int i = 0; i < nDim; i ++)
        {
            if((i!=nCol) && (i!=nRow))
            {
                int u = i*nDim + nRow;  //p
                int w = i*nDim + nCol;  //q
                dbMax = pMatrix[u];
                pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
                pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
            }
        }

        for (int j = 0; j < nDim; j ++)
        {
            if((j!=nCol) && (j!=nRow))
            {
                int u = nRow*nDim + j;  //p
                int w = nCol*nDim + j;  //q
                dbMax = pMatrix[u];
                pMatrix[u]= pMatrix[w]*dbSinTheta + dbMax*dbCosTheta;
                pMatrix[w]= pMatrix[w]*dbCosTheta - dbMax*dbSinTheta;
            }
        }

        //计算特征向量
        for(int i = 0; i < nDim; i ++)
        {
            int u = i*nDim + nRow;      //p
            int w = i*nDim + nCol;      //q
            dbMax = pdblVects[u];
            pdblVects[u] = pdblVects[w]*dbSinTheta + dbMax*dbCosTheta;
            pdblVects[w] = pdblVects[w]*dbCosTheta - dbMax*dbSinTheta;
        }

    }

    //对特征值进行排序以及重新排列特征向量,特征值即pMatrix主对角线上的元素
    std::map<double,int> mapEigen;
    for(int i = 0; i < nDim; i ++)
    {
        pdbEigenValues[i] = pMatrix[i*nDim+i];

        mapEigen.insert(make_pair( pdbEigenValues[i],i ) );
    }

    double *pdbTmpVec = new double[nDim*nDim];
    std::map<double,int>::reverse_iterator iter = mapEigen.rbegin();
    for (int j = 0; iter != mapEigen.rend(),j < nDim; ++iter,++j)
    {
        for (int i = 0; i < nDim; i ++)
        {
            pdbTmpVec[i*nDim+j] = pdblVects[i*nDim + iter->second];
        }

        //特征值重新排列
        pdbEigenValues[j] = iter->first;
    }

    //设定正负号
    for(int i = 0; i < nDim; i ++)
    {
        double dSumVec = 0;
        for(int j = 0; j < nDim; j ++)
            dSumVec += pdbTmpVec[j * nDim + i];
        if(dSumVec<0)
        {
            for(int j = 0;j < nDim; j ++)
                pdbTmpVec[j * nDim + i] *= -1;
        }
    }

    memcpy(pdblVects,pdbTmpVec,sizeof(double)*nDim*nDim);
    for(int i=0; i<nDim; i++){
        values[i] = pdbEigenValues[i];
        for(int j=0; j<nDim; j++){
            vects[i][j] = pdblVects[i*nDim+j];
        }
    }
    delete []pdbTmpVec;
    delete []pdblVects;
    delete []pdbEigenValues;

    return 0;
}

/**
* @brief 矩阵乘法
* 须满足 a的列数 == b的行数
* @param a      m*k 矩阵
* @param b      k*n 矩阵
* @param result m*n 矩阵 乘法结果
* @return
*/
int mul(const vector<vector<double> >& a, const vector<vector<double> >& b, vector<vector<double> >& result){
    const int m = a.size();
    const int n = a[0].size();
    const int k = b[0].size();
    //cout<<"mul: "<<a.size()<<"*"<<a[0].size()<<" "<<b.size()<<"*"<<b[0].size()<<endl;
    for (int i = 0; i < m; i++)  //a的行数
    {
        vector<double> t;
        for (int j = 0; j < k; j++)      //b的列数
        {
            double sum = 0;
            for (int w = 0; w < n; w++)  //a的列数
            {
                sum += a[i][w] * b[w][j];
            }
            t.push_back(sum);
        }
        result.push_back(t);
    }
    return 0;
}

/**
* @brief 向量方差
* 须满足 a b 维数相等
* @param a      n维向量
* @param b      n维向量
* @param result 方差
* @return
*/
int variance(vector<double> a, vector<double> b, double& result){
    result = 0;
    const int n = a.size();
    for(int i=0; i<n; i++){
        result += (a[i]-b[i])*(a[i]-b[i]);
    }
    result /= n;
    return 0;
}

const int m = 68040;    //行,向量数
const int n = 32;       //列,维数

double kee = 0.95;      //能量阈值,计算最小满足的k 也可直接指定k

int main(){
    fstream fs("ColorHistogram.asc");

    vector<vector<double> > data; //原始数据
    vector<double> mean;//均值
    vector<vector<double> > covar;//协方差

    for(int i=0; i<m; i++){
        vector<double> line;
        for(int j=0; j<=n; j++){
            double t;
            fs>>t;
            if(j != 0){
                line.push_back(t);
            }
        }
        data.push_back(line);
    }
    fs.close();
    cout<<"read data over."<<endl;

    calcCovarMatrix(data, covar, mean);
    cout<<"calcCovarMatrix over."<<endl;

    vector<vector<double> > pdblVects;
    pdblVects.resize(n);
    for(int i=0; i<n; i++) pdblVects[i].resize(n);
    vector<double> pdbEigenValues;
    pdbEigenValues.resize(n);

    JacbiCor(covar, pdblVects, pdbEigenValues, 0.1, 1000);
    cout<<"JacbiCor over."<<endl;

    cout<<"eigenvalue: "<<endl;
    double sum_pdbEigenValues = 0;
    for(auto v : pdbEigenValues){
        sum_pdbEigenValues += v;
        cout<<v<<"\t";
    }
    cout<<endl;

    //计算最小满足kee的k值
    int k = 0;
    double t_sum = 0;
    for(; k<n; k++){
        t_sum += pdbEigenValues[k];
        cout<<k<<" "<<t_sum/sum_pdbEigenValues<<endl;
        if(t_sum/sum_pdbEigenValues > kee){
            break;
        }
    }
    k++;
    //k = 32;
    cout<<"k: "<<k<<endl;

    //旋转以用于矩阵乘法
    vector<vector<double> > rotate_pdblVects;
    rotate_pdblVects.resize(n);
    for(int i=0; i<k; i++){
        for(int j=0; j<n; j++){
            rotate_pdblVects[j].push_back(pdblVects[i][j]);
        }
    }
    //降维
    vector<vector<double> > z;
    mul(data, rotate_pdblVects, z);
    cout<<"to k-dim over."<<endl;
    //还原
    vector<vector<double> > xap;
    mul(z, pdblVects, xap);
    cout<<"back n-dim over."<<endl;
    //与原始数据进行对比
    cout.setf(ios::fixed);
    for(int i=0; i<15; i++){
        cout<<fixed<<setprecision(4)<<data[0][i]<<"\t";
    }
    cout<<endl;
    for(int i=0; i<15; i++){
        cout<<fixed<<setprecision(4)<<xap[0][i]<<"\t";
    }
    cout<<endl;
    //计算与原始数据的标准差
    cout<<"variance: "<<endl;
    for(int i=0; i<10; i++){
        double vari;
        variance(data[i], xap[i], vari);
        cout<<i<<":\t"<<sqrt(vari)<<endl;
    }
    return 0;
}

  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值