A brief outline of PCA:
- Center the data (subtract the per-dimension mean);
- Compute the covariance matrix of the centered data;
- Compute the eigenvalues and eigenvectors of the covariance matrix;
- Sort the eigenvalues in descending order, reordering the eigenvectors with them;
- Take a chosen fraction of the leading sorted eigenvectors to form the projection matrix;
- Multiply the centered data by the projection matrix to obtain the reduced-dimension data.
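As a quick illustration before the full program, here is a minimal, self-contained Eigen sketch of the recipe above on a toy 4x2 matrix. The data values are arbitrary and the variable names are illustrative only; it is a sketch of the steps, not the implementation that follows.

#include <iostream>
#include <Eigen/Dense>

int main()
{
    // Toy 4x2 data matrix: 4 samples, 2 features
    Eigen::MatrixXd x(4, 2);
    x << 1.0, 2.0,
         3.0, 3.0,
         2.0, 5.0,
         4.0, 6.0;
    // Step 1: center each column
    Eigen::MatrixXd centered = x.rowwise() - x.colwise().mean();
    // Step 2: covariance of the centered data
    Eigen::MatrixXd cov = centered.transpose() * centered / double(x.rows() - 1);
    // Steps 3-4: eigen-decomposition; SelfAdjointEigenSolver applies to
    // symmetric matrices and returns eigenvalues in ascending order
    Eigen::SelfAdjointEigenSolver<Eigen::MatrixXd> es(cov);
    // Step 5: keep the eigenvector of the largest eigenvalue (last column)
    Eigen::MatrixXd projection = es.eigenvectors().rightCols(1);
    // Step 6: project the centered data down to one dimension
    std::cout << centered * projection << std::endl;
    return 0;
}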
Implementing PCA in C++
auxiliary.hpp
#pragma once
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <string>
#include <vector>
#include <Eigen/Dense>
#include <opencv2/highgui.hpp>
#include <opencv2/imgproc.hpp>
/* Parse one line of the iris data set into a sample vector */
std::vector<double> ParseStrLine(const std::string& line) {
    std::vector<double> one_sample;
    auto found = line.find_first_of(",");
    double val = 0.0;
    std::string substr = line;
    while (found != std::string::npos)
    {
        val = atof(std::string(substr.begin(), substr.begin() + found).c_str());
        one_sample.push_back(val);
        substr = std::string(substr.begin() + found + 1, substr.end());
        found = substr.find_first_of(",");
    }
    val = atof(substr.c_str());
    one_sample.push_back(val);
    return one_sample;
}
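// Example: ParseStrLine("5.1,3.5,1.4,0.2") returns {5.1, 3.5, 1.4, 0.2}.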
/* Read the iris data set */
int GetData(std::vector<std::vector<double> > &iris)
{
    std::ifstream data_txt("../data/iris.txt");
    if (!data_txt.is_open())
    {
        std::cout << "Read data failed." << std::endl;
        system("pause");
        return -1;
    }
    // iris holds the raw samples, one row per line of the file
    std::string line("");
    while (std::getline(data_txt, line))
    {
        iris.push_back(ParseStrLine(line));
    }
    return 0;
}
/* Read the iris data set labels */
int GetLabels(std::vector<int> &labels)
{
    std::ifstream label_txt("../data/iris_label.txt");
    if (!label_txt.is_open())
    {
        std::cout << "Read labels failed." << std::endl;
        system("pause");
        return -1;
    }
    // Labels for the raw iris data, all on one comma-separated line
    std::string line("");
    std::getline(label_txt, line);
    auto found = line.find_first_of(",");
    std::string substr = line;
    int val = 0;
    while (found != std::string::npos)
    {
        val = atoi(std::string(substr.begin(), substr.begin() + found).c_str());
        labels.push_back(val);
        substr = std::string(substr.begin() + found + 1, substr.end());
        found = substr.find_first_of(",");
    }
    val = atoi(substr.c_str());
    labels.push_back(val);
    return 0;
}
/* Save the PCA-reduced data */
int SaveDecompositionData(const Eigen::MatrixXd &pca_m)
{
    std::ofstream pca_txt("../data/pca_iris.txt");
    if (!pca_txt.is_open())
    {
        std::cout << "Open output file failed." << std::endl;
        system("pause");
        return -1;
    }
    char content[30] = { '\0' };
    for (int i = 0; i < pca_m.rows(); ++i)
    {
        // snprintf is the portable equivalent of MSVC's sprintf_s
        snprintf(content, 30, "%.12f", pca_m(i, 0));
        pca_txt << content << ", ";
        snprintf(content, 30, "%.12f", pca_m(i, 1));
        pca_txt << content << std::endl;
    }
    pca_txt.close();
    return 0;
}
/* Visualize the PCA-reduced data */
void Visualization(const Eigen::MatrixXd &pca_m, const std::vector<int> &labels)
{
    cv::Mat visual(220, 220, CV_8UC3, cv::Scalar(0, 0, 0)); // start from a black canvas
    cv::Scalar color[] = { cv::Scalar(255,0,0), cv::Scalar(0,255,0), cv::Scalar(0,0,255) };
    for (int i = 0; i < pca_m.rows(); ++i)
    {
        cv::Point center(pca_m(i, 0) * 30 + 100, pca_m(i, 1) * 30 + 100);
        // cv::Mat::at<> takes (row, col), i.e. (y, x)
        visual.at<cv::Vec3b>(center.y, center.x) = cv::Vec3b(color[labels[i]][0], color[labels[i]][1], color[labels[i]][2]);
    }
    cv::imshow("visual", visual);
    cv::waitKey(0);
}
pca.cpp
#include "auxiliary.hpp"
#define USE_LESS_EIGEN // when defined, use as little of Eigen as possible, staying closer to the PCA derivation
#define FEAT_REMAIN_RATIO 0.5 // fraction of feature dimensions to keep
#define EQUALIZATION_WITH_SKLEARN // make the results match sklearn's output
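// For the iris data (4 features), FEAT_REMAIN_RATIO 0.5 keeps 4 * 0.5 = 2 dimensions.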
int main()
{
    /* Step 0: read the iris data */
    std::vector<std::vector<double> > iris;
    int ret = GetData(iris);
    if (0 != ret)
    {
        std::cout << "GetData() failed." << std::endl;
        system("pause");
        return ret;
    }
    /* Step 1: read the iris data labels */
    std::vector<int> labels;
    ret = GetLabels(labels);
    if (0 != ret)
    {
        std::cout << "GetLabels() failed." << std::endl;
        system("pause");
        return ret;
    }
    /* Step 2: center the iris data */
#ifdef USE_LESS_EIGEN
    // Compute the per-dimension mean of the raw data
    std::vector<double> mean(iris[0].size()); // per-dimension feature means
    std::vector<double> sum(iris[0].size());
    for (const auto &sample : iris)
    {
        for (int i = 0; i < sample.size(); ++i)
        {
            sum[i] += sample[i];
        }
    }
    for (int i = 0; i < sum.size(); ++i)
    {
        mean[i] = sum[i] / iris.size();
    }
    // Compute the centered data
    Eigen::MatrixXd m(iris.size(), iris[0].size()); // holds the centered data
    for (int i = 0; i < iris.size(); ++i)
    {
        for (int j = 0; j < iris[0].size(); ++j)
        {
            m(i, j) = iris[i][j] - mean[j];
        }
    }
    Eigen::MatrixXd zero_mean_m = m;
#else
    Eigen::MatrixXd m(iris.size(), iris[0].size()); // holds the centered data
    for (int i = 0; i < iris.size(); ++i)
    {
        for (int j = 0; j < iris[0].size(); ++j)
        {
            m(i, j) = iris[i][j];
        }
    }
    Eigen::MatrixXd mean_vec = m.colwise().mean();
    Eigen::RowVectorXd mean_vec_row(Eigen::RowVectorXd::Map(mean_vec.data(), m.cols()));
    Eigen::MatrixXd zero_mean_m = m;
    zero_mean_m.rowwise() -= mean_vec_row;
#endif
    /* Step 3: compute the covariance matrix of the centered data zero_mean_m */
#ifdef USE_LESS_EIGEN
    Eigen::MatrixXd zero_mean_transpose = zero_mean_m.transpose();
    Eigen::MatrixXd covariance = zero_mean_transpose * zero_mean_m / (zero_mean_m.rows() - 1);
    std::cout << covariance << std::endl;
#else
    Eigen::MatrixXd covariance = (zero_mean_m.adjoint() * zero_mean_m) / double(zero_mean_m.rows() - 1);
    std::cout << covariance << std::endl;
#endif
    /* Step 4: compute the eigenvalues and eigenvectors of the covariance matrix */
    Eigen::EigenSolver<Eigen::MatrixXd> eigen_solver(covariance);
    if (eigen_solver.info() != Eigen::Success)
    {
        std::cout << "Solver error." << std::endl;
    }
    std::cout << "Eigenvalues:\n" << eigen_solver.eigenvalues().real() << std::endl;
    std::cout << "Eigenvectors:\n" << eigen_solver.eigenvectors().real() << std::endl;
    /* Step 5: sort the eigenvalues in descending order, reordering the eigenvectors with them */
    std::vector<std::pair<double, std::vector<double>>> eigen_values_vectors;
    for (int i = 0; i < covariance.rows(); ++i)
    {
        std::pair<double, std::vector<double>> eigen_value_vector;
        eigen_value_vector.first = eigen_solver.eigenvalues().real()[i];
        for (int j = 0; j < covariance.cols(); ++j)
        {
            // Eigen stores eigenvectors as columns, so the index below is (j, i)
            eigen_value_vector.second.push_back(eigen_solver.eigenvectors()(j, i).real());
        }
        eigen_values_vectors.push_back(eigen_value_vector);
    }
    struct CEigenBig {
        bool operator() (const std::pair<double, std::vector<double>> &i, const std::pair<double, std::vector<double>> &j) { return (i.first > j.first); }
    } eigen_big;
    std::sort(eigen_values_vectors.begin(), eigen_values_vectors.end(), eigen_big); // descending eigenvalues, eigenvectors follow
    int remain = static_cast<int>(eigen_values_vectors.size() * FEAT_REMAIN_RATIO); // number of feature dimensions to keep
    Eigen::MatrixXd components(eigen_values_vectors.size(), remain); // the eigenvectors of the kept dimensions
    for (int j = 0; j < components.cols(); ++j)
    {
        for (int i = 0; i < components.rows(); ++i)
        {
            // This block makes the final reduced data exactly match sklearn's.
            // Implementation details can flip the sign of an eigenvector (both
            // signs are valid), which flips the sign of the projected data. For
            // the iris data, the eigenvector of the second-largest eigenvalue
            // differs in sign between sklearn and this implementation, so
            // negating that eigenvector makes the second output dimension agree.
#ifdef EQUALIZATION_WITH_SKLEARN
            if (1 == j)
            {
                components(i, j) = -eigen_values_vectors[j].second[i];
            }
            else
            {
                components(i, j) = eigen_values_vectors[j].second[i];
            }
#else
            components(i, j) = eigen_values_vectors[j].second[i];
#endif
        }
    }
    std::cout << components << std::endl;
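    // An optional cross-check (not in the original program): the explained
    // variance ratio that sklearn prints can be reproduced from the sorted
    // eigenvalues, since each ratio is eigenvalue / sum of all eigenvalues:
    //   double total = 0.0;
    //   for (const auto &ev : eigen_values_vectors) total += ev.first;
    //   for (int k = 0; k < remain; ++k)
    //       std::cout << eigen_values_vectors[k].first / total << std::endl;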
    /* Step 6: multiply the centered data by the (kept) eigenvectors to get the reduced data */
    Eigen::MatrixXd pca_m = zero_mean_m * components;
    std::cout << "Reduced data:\n" << pca_m << std::endl;
    /* Step 7: save the reduced data */
    ret = SaveDecompositionData(pca_m);
    if (0 != ret)
    {
        std::cout << "SaveDecompositionData() failed." << std::endl;
        system("pause");
        return ret;
    }
    /* Step 8: visualize the reduced data */
    Visualization(pca_m, labels);
    system("pause");
    return 0;
}
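A note on building: the program depends on Eigen and OpenCV. On Linux, a command along the following lines should work; the include path and pkg-config module name are assumptions about a typical installation, so adjust them to where Eigen and OpenCV live on your machine (on non-Windows platforms the system("pause") calls can simply be removed):

g++ pca.cpp -std=c++11 -I/usr/include/eigen3 $(pkg-config --cflags --libs opencv4) -o pca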
Implementing PCA with sklearn
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.decomposition import PCA
import numpy
iris = datasets.load_iris()
x = iris.data
y = iris.target
target_names = iris.target_names
pca = PCA(n_components=2)
X_r = pca.fit(x).transform(x)
cov = pca.get_covariance()
# X_r = pca.fit_transform(x)
print("explained variance ratio (first two components): %s" % str(pca.explained_variance_ratio_))
#fout = open(r'..\data\sklearn_pca_iris.txt', 'w',encoding='UTF-8')
#for i in range(len(X_r)):
# fout.write(str(X_r[i][0])+', '+str(X_r[i][1])+'\n')
#fout.close()
numpy.savetxt(r'..\data\sklearn_pca_iris.txt', X_r, fmt='%.12f', delimiter=', ', newline='\n')
plt.figure()
colors = ["navy", "turquoise", "darkorange"]
lw = 2
for color, i, target_name in zip(colors, [0, 1, 2], target_names):
    plt.scatter(
        X_r[y == i, 0], X_r[y == i, 1], color=color, alpha=0.8, lw=lw, label=target_name
    )
plt.legend(loc="best", shadow=False, scatterpoints=1)
plt.title("PCA of IRIS dataset")
plt.show()
Data
iris.txt
5.1,3.5,1.4,0.2
4.9,3. ,1.4,0.2
4.7,3.2,1.3,0.2
4.6,3.1,1.5,0.2
5. ,3.6,1.4,0.2
5.4,3.9,1.7,0.4
4.6,3.4,1.4,0.3
5. ,3.4,1.5,0.2
4.4,2.9,1.4,0.2
4.9,3.1,1.5,0.1
5.4,3.7,1.5,0.2
4.8,3.4,1.6,0.2
4.8,3. ,1.4,0.1
4.3,3. ,1.1,0.1
5.8,4. ,1.2,0.2
5.7,4.4,1.5,0.4
5.4,3.9,1.3,0.4
5.1,3.5,1.4,0.3
5.7,3.8,1.7,0.3
5.1,3.8,1.5,0.3
5.4,3.4,1.7,0.2
5.1,3.7,1.5,0.4
4.6,3.6,1. ,0.2
5.1,3.3,1.7,0.5
4.8,3.4,1.9,0.2
5. ,3. ,1.6,0.2
5. ,3.4,1.6,0.4
5.2,3.5,1.5,0.2
5.2,3.4,1.4,0.2
4.7,3.2,1.6,0.2
4.8,3.1,1.6,0.2
5.4,3.4,1.5,0.4
5.2,4.1,1.5,0.1
5.5,4.2,1.4,0.2
4.9,3.1,1.5,0.1
5. ,3.2,1.2,0.2
5.5,3.5,1.3,0.2
4.9,3.1,1.5,0.1
4.4,3. ,1.3,0.2
5.1,3.4,1.5,0.2
5. ,3.5,1.3,0.3
4.5,2.3,1.3,0.3
4.4,3.2,1.3,0.2
5. ,3.5,1.6,0.6
5.1,3.8,1.9,0.4
4.8,3. ,1.4,0.3
5.1,3.8,1.6,0.2
4.6,3.2,1.4,0.2
5.3,3.7,1.5,0.2
5. ,3.3,1.4,0.2
7. ,3.2,4.7,1.4
6.4,3.2,4.5,1.5
6.9,3.1,4.9,1.5
5.5,2.3,4. ,1.3
6.5,2.8,4.6,1.5
5.7,2.8,4.5,1.3
6.3,3.3,4.7,1.6
4.9,2.4,3.3,1.
6.6,2.9,4.6,1.3
5.2,2.7,3.9,1.4
5. ,2. ,3.5,1.
5.9,3. ,4.2,1.5
6. ,2.2,4. ,1.
6.1,2.9,4.7,1.4
5.6,2.9,3.6,1.3
6.7,3.1,4.4,1.4
5.6,3. ,4.5,1.5
5.8,2.7,4.1,1.
6.2,2.2,4.5,1.5
5.6,2.5,3.9,1.1
5.9,3.2,4.8,1.8
6.1,2.8,4. ,1.3
6.3,2.5,4.9,1.5
6.1,2.8,4.7,1.2
6.4,2.9,4.3,1.3
6.6,3. ,4.4,1.4
6.8,2.8,4.8,1.4
6.7,3. ,5. ,1.7
6. ,2.9,4.5,1.5
5.7,2.6,3.5,1.
5.5,2.4,3.8,1.1
5.5,2.4,3.7,1.
5.8,2.7,3.9,1.2
6. ,2.7,5.1,1.6
5.4,3. ,4.5,1.5
6. ,3.4,4.5,1.6
6.7,3.1,4.7,1.5
6.3,2.3,4.4,1.3
5.6,3. ,4.1,1.3
5.5,2.5,4. ,1.3
5.5,2.6,4.4,1.2
6.1,3. ,4.6,1.4
5.8,2.6,4. ,1.2
5. ,2.3,3.3,1.
5.6,2.7,4.2,1.3
5.7,3. ,4.2,1.2
5.7,2.9,4.2,1.3
6.2,2.9,4.3,1.3
5.1,2.5,3. ,1.1
5.7,2.8,4.1,1.3
6.3,3.3,6. ,2.5
5.8,2.7,5.1,1.9
7.1,3. ,5.9,2.1
6.3,2.9,5.6,1.8
6.5,3. ,5.8,2.2
7.6,3. ,6.6,2.1
4.9,2.5,4.5,1.7
7.3,2.9,6.3,1.8
6.7,2.5,5.8,1.8
7.2,3.6,6.1,2.5
6.5,3.2,5.1,2.
6.4,2.7,5.3,1.9
6.8,3. ,5.5,2.1
5.7,2.5,5. ,2.
5.8,2.8,5.1,2.4
6.4,3.2,5.3,2.3
6.5,3. ,5.5,1.8
7.7,3.8,6.7,2.2
7.7,2.6,6.9,2.3
6. ,2.2,5. ,1.5
6.9,3.2,5.7,2.3
5.6,2.8,4.9,2.
7.7,2.8,6.7,2.
6.3,2.7,4.9,1.8
6.7,3.3,5.7,2.1
7.2,3.2,6. ,1.8
6.2,2.8,4.8,1.8
6.1,3. ,4.9,1.8
6.4,2.8,5.6,2.1
7.2,3. ,5.8,1.6
7.4,2.8,6.1,1.9
7.9,3.8,6.4,2.
6.4,2.8,5.6,2.2
6.3,2.8,5.1,1.5
6.1,2.6,5.6,1.4
7.7,3. ,6.1,2.3
6.3,3.4,5.6,2.4
6.4,3.1,5.5,1.8
6. ,3. ,4.8,1.8
6.9,3.1,5.4,2.1
6.7,3.1,5.6,2.4
6.9,3.1,5.1,2.3
5.8,2.7,5.1,1.9
6.8,3.2,5.9,2.3
6.7,3.3,5.7,2.5
6.7,3. ,5.2,2.3
6.3,2.5,5. ,1.9
6.5,3. ,5.2,2.
6.2,3.4,5.4,2.3
5.9,3. ,5.1,1.8
iris_label.txt
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2