PCA降维（主成分分析）处理训练集后，线上正式数据应该如何处理？

最新推荐文章于 2024-07-20 17:25:24 发布

TURING.DT

最新推荐文章于 2024-07-20 17:25:24 发布

阅读量9.1k

点赞数 13

分类专栏：机器学习/数据挖掘

本文链接：https://blog.csdn.net/levy_cui/article/details/102670611

版权

机器学习/数据挖掘专栏收录该内容

81 篇文章 4 订阅

订阅专栏

训练数据集在使用PCA进行数据降维后，用基本分类器进行训练得到一个分类模型，那线上预测真实数据应该怎么办？应该不能直接放入训练的分类模型中去吧？

答：当然不能。要用在训练数据上拟合（fit）得到的同一个PCA变换（均值和投影矩阵）对线上数据降维，即只调用 transform、不能在新数据上重新 fit，然后再把降维后的结果送给分类器。

如何理解？如何操作？
可以参考下面这个“PCA+SVM 模型的保存及使用”的例子。

训练模型的代码，仔细阅读注释内容

import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pickle
import sklearn.metrics as sm
from sklearn.metrics import classification_report
import sklearn.svm as svm
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from time import time
from sklearn.model_selection import GridSearchCV

def cv_imread(file_path):
    """Read and decode the image stored at ``file_path``.

    The file's bytes are loaded with NumPy and decoded in memory, which
    lets images whose path contains Chinese characters be opened.

    Args:
        file_path: Path of the image file.

    Returns:
        Whatever ``cv2.imdecode`` produces for the file's bytes.
    """
    raw_bytes = np.fromfile(file_path, dtype=np.uint8)
    # -1 is the decode flag passed through to cv2.imdecode.
    return cv2.imdecode(raw_bytes, -1)
def search_files(directory):
    """Collect flattened grayscale images and their labels from a folder tree.

    Walks ``directory`` recursively. Every file whose name ends with
    ``.jpg`` is decoded with ``cv_imread``, converted to grayscale, resized
    to 256x256 and flattened into a 1-D vector. An image's label is the
    last path component of the folder that contains it.

    Files that cannot be decoded are skipped (a message is printed) instead
    of aborting the whole scan, and images that are already single-channel
    are used as-is rather than being passed to the BGR->gray conversion.

    Args:
        directory: Root folder to scan.

    Returns:
        A ``(images, labels)`` tuple of two parallel lists: the flattened
        image vectors and the folder-name label of each one.
    """
    images = []
    labels = []
    for curdir, _subdirs, files in os.walk(directory):
        # All images in one folder share that folder's name as their label
        # (last component after splitting on the OS path separator).
        label = curdir.split(os.path.sep)[-1]
        for file_name in files:
            if not file_name.endswith('.jpg'):
                continue
            path = os.path.join(curdir, file_name)
            img = cv_imread(path)
            if img is None:
                # The decoder returns None for data it cannot decode; skip
                # the file so one corrupt image does not stop the scan.
                print('search_files: skipping unreadable image %s' % path)
                continue
            if img.ndim == 2:
                # Decoded "unchanged", a grayscale file is already 2-D and
                # would be rejected by the BGR->gray conversion.
                img_gray = img
            else:
                img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            resized = cv2.resize(img_gray, (256, 256))
            images.append(resized.reshape(-1))
            labels.append(label)
    return images, labels
                
# Load every training image as a flattened vector plus its folder-name label.
img, labels = search_files('./train_pic')

# Fit a label encoder on the labels; the fitted encoder is kept around so it
# can be stored next to the model.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(labels)

# NOTE: the split below uses the raw labels (Y), not the encoded `y` above.
X = np.array(img)
Y = np.array(labels)

train_x, test_x, train_y, test_y = train_test_split(
    X, Y, test_size=0.25, random_state=7)

# Folder the trained model file is written to.
output_dir = './output'

t0 = time()
# Fit PCA (20 components) on the training split only. This fitted object is
# what has to be kept for later use: new data must go through the same
# transform before it reaches the classifier.
pca = PCA(n_components=20, svd_solver='randomized', whiten=True)
pca.fit(train_x)
print("done in %0.3fs" % (time() - t0))

# Project both splits with the PCA fitted on the training split.
X_train_pca = pca.transform(train_x)
X_test_pca = pca.transform(test_x)

# Grid-search the SVM hyper-parameters.
param_grid = {
    'C': [1e3, 5e3, 1e4, 5e4, 1e5],
    'gamma': [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1],
}
clf = GridSearchCV(
    SVC(kernel='poly', class_weight='balanced'), param_grid
).fit(X_train_pca, train_y)
print("Best estimator found by grid search:")
print(clf.best_estimator_)

# Evaluate on the held-out split; the elapsed time is measured from t0.
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))
print(classification_report(test_y, y_pred))

def save_model(model, pca, label_encoder, output_file):
    """Pickle the classifier together with its fitted PCA and label encoder.

    The three objects are written as one dict with the keys ``'model'``,
    ``'pca_fit'`` and ``'label_encoder'``. The parent directory of
    ``output_file`` is created when it does not exist yet, so a missing
    output folder no longer makes the save fail.

    Args:
        model: Trained classifier to persist.
        pca: Fitted PCA object used to transform inputs for ``model``.
        label_encoder: Fitted label encoder to store alongside them.
        output_file: Path of the pickle file to write.

    Returns:
        True when the file was written, False when writing failed (the
        reason is printed so the failure is not silent).
    """
    bundle = {
        'model': model,
        'pca_fit': pca,
        'label_encoder': label_encoder,
    }
    try:
        parent_dir = os.path.dirname(output_file)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)
        with open(output_file, 'wb') as outfile:
            pickle.dump(bundle, outfile)
    except Exception as exc:
        # Callers rely on a boolean result rather than an exception, so
        # report the cause and keep that contract. Unlike a bare except,
        # this lets KeyboardInterrupt/SystemExit propagate.
        print('save_model: could not write %s: %r' % (output_file, exc))
        return False
    return True
# Key point: the fitted PCA object can be pickled and reused directly, so it is
# stored in the same file as the classifier and applied to new data at
# prediction time.
model_path = os.path.join(output_dir, 'model_svm_poly_test010412.pkl')
# save_model reports failure through its return value, so check it instead of
# discarding it; otherwise a failed save goes unnoticed.
if not save_model(clf, pca, label_encoder, model_path):
    print('failed to save model to %s' % model_path)

到这一步，模型已经训练并保存好了，接下来就可以加载模型进行预测了。

import pickle
import os
import cv2
import numpy as np
from sklearn.decomposition import PCA
# Output folder path.
# NOTE(review): the code visible below hard-codes the model path and never
# reads this variable -- confirm whether it is still needed.
output_dir='./output/'
class Predictor(object):
    """Classify single images with a pickled classifier + PCA bundle.

    The bundle is a dict with the keys ``'model'``, ``'pca_fit'`` and
    ``'label_encoder'``. Each image is preprocessed the same way for every
    call (grayscale, 256x256, flattened), projected with the stored PCA and
    then passed to the stored model.
    """

    def __init__(self, model_file):
        """Load the pickled bundle from ``model_file``.

        Args:
            model_file: Path of the pickle file to load.

        Raises:
            KeyError: If the loaded dict lacks one of the expected keys.
        """
        # NOTE(security): pickle.load can execute arbitrary code while
        # loading; only open model files from a trusted source.
        with open(model_file, 'rb') as infile:
            self.loaded = pickle.load(infile)
        self.model = self.loaded['model']
        # The PCA fitted during training; reused as-is to transform each
        # single image before prediction.
        self.pca = self.loaded['pca_fit']
        self.label_encoder = self.loaded['label_encoder']

    def cv_imread(self, file_path):
        """Decode the image at ``file_path`` from its raw bytes.

        Returns whatever ``cv2.imdecode`` produces for the file's bytes,
        decoded with flag -1.
        """
        return cv2.imdecode(np.fromfile(file_path, dtype=np.uint8), -1)

    def predict(self, img_file):
        """Classify one image file.

        Args:
            img_file: Path of the image to classify.

        Returns:
            The result of ``self.model.predict`` for this single image.
        """
        img = self.cv_imread(img_file)
        if getattr(img, 'ndim', None) == 2:
            # Decoded with flag -1, a grayscale file is already 2-D and
            # would be rejected by the BGR->gray conversion.
            img_gray = img
        else:
            # Anything else (including a failed decode) goes through
            # cvtColor exactly as before.
            img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        img = cv2.resize(img_gray, (256, 256))
        # One row holding the flattened pixels: a single-sample 2-D array.
        img_col = img.reshape(1, -1)
        # Project the image with the stored PCA to get its features.
        X_test_pca = self.pca.transform(img_col)
        return self.model.predict(X_test_pca)
# Load the saved model bundle once.
predictor = Predictor(
    './output/model_svm_poly_test010412.pkl'
)
# Classify a new image.
label = predictor.predict(
    'C:/Users/ugc/Desktop/20190325分类器训练图片/AI测试图片集/train/train_big_small/test_data/big/22.jpg'
)

OK，关键就是把训练好的 pca 对象直接保存成文件，以后使用时直接加载这个文件即可，这样就方便多了。网上这方面的资料很少；我觉得实际生产环境中较少用到降维，维度再多分布式系统也可以处理，单机或者比赛中才可能会用到降维。

总结：应用PCA降维时，要保存的是在训练集上 fit 得到的 PCA 对象（而不是降维后的数据）；正式环境加载这个文件，用它对新数据做 transform 降维，再将降维后的数据输入到模型中。

参考：

https://blog.csdn.net/rocketye/article/details/89246549