光谱特征选择之模拟退火算法SA

肠胃是一种精神

已于 2024-04-18 23:22:47 修改

阅读量682

点赞数 10

文章标签： python 机器学习

于 2024-04-18 22:51:46 首次发布

本文链接：https://blog.csdn.net/weixin_42621095/article/details/137940907

版权

本文讲述了如何使用模拟退火算法结合PLS回归优化光谱数据的特征选择，提升预测模型性能。

摘要由CSDN通过智能技术生成

光谱特征选择之模拟退火算法SA

import numpy as np
import matplotlib.pyplot as plt
import matplotlib.collections as collections
import pandas as pd
from scipy import interpolate
from scipy.signal import savgol_filter
from sys import stdout
from sklearn.cross_decomposition import PLSRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict
from pandas import read_csv

data = read_csv('E:\python data\peach_spectra_brix.csv',header = None)
# wl = np.arange(730,1100.5,0.5)
X = np.array(data.loc[:,1:])
y = data.loc[:,0]

def base_pls(X,y,n_components, return_model=False):
    # Simple PLS
    pls_simple = PLSRegression(n_components=n_components)
    # Fit
    pls_simple.fit(X, y)
    # Cross-validation
    y_cv = cross_val_predict(pls_simple, X, y, cv=10)
    # Calculate scores
    score = r2_score(y, y_cv)
    rmsecv = np.sqrt(mean_squared_error(y, y_cv))
    if return_model == False:
        return(y_cv, score, rmsecv)
    else:
        return(y_cv, score, rmsecv, pls_simple)

def pls_optimise_components(X, y, npc):
    rmsecv = np.zeros(npc)
    for i in range(1,npc+1,1):
        # Simple PLS
        pls_simple = PLSRegression(n_components=i)
        # Fit
        pls_simple.fit(X, y)
        # Cross-validation
        y_cv = cross_val_predict(pls_simple, X, y, cv=10)
        # Calculate scores
        score = r2_score(y, y_cv)
        rmsecv[i-1] = np.sqrt(mean_squared_error(y, y_cv))
    # Find the minimum of ther RMSE and its location
    opt_comp, rmsecv_min = np.argmin(rmsecv),  rmsecv[np.argmin(rmsecv)]
    return (opt_comp+1, rmsecv_min)

def msc(input_data, reference=None):
    ''' Perform Multiplicative scatter correction'''
    # mean centre correction
    for i in range(input_data.shape[0]):
        input_data[i,:] -= input_data[i,:].mean()
    # Get the reference spectrum. If not given, estimate it from the mean
    if reference is None:
        # Calculate mean
        ref = np.mean(input_data, axis=0)
    else:
        ref = reference
    # Define a new array and populate it with the corrected data
    data_msc = np.zeros_like(input_data)
    for i in range(input_data.shape[0]):
        # Run regression
        fit = np.polyfit(ref, input_data[i,:], 1, full=True)
        # Apply correction
        data_msc[i,:] = (input_data[i,:] - fit[0][1]) / fit[0][0]
    return (data_msc, ref)

def band_selection_sa(X, y, n_of_bands, max_lv, n_iter):
    p = np.arange(X.shape[1])
    np.random.shuffle(p)
    bands = p[:n_of_bands]  # Selected Bands. Start off with a random selection
    nbands = p[n_of_bands:]  # Excluded bands
    Xop = X[:, bands]  # This is the array to be optimised
    # Run a PLS optimising the number of latent variables
    opt_comp, rmsecv_min = pls_optimise_components(Xop, y, max_lv)
    rms = []  # Here we store the RMSE value as the optimisation progresses
    for i in range(n_iter):
        cool = 0.005 * rmsecv_min  # cooling parameter. It decreases with the RMSE
        new_bands = np.copy(bands)
        new_nbands = np.copy(nbands)
        r1, r2 = np.random.randint(0, n_of_bands, 2), np.random.randint(n_of_bands, X.shape[1] - n_of_bands, 2)
        #随机交换两个位置
        el1, el2 = new_bands[r1], new_nbands[r2]
        new_bands[r1] = el2
        new_nbands[r2] = el1
        Xop = X[:, new_bands]
        opt_comp_new, rmsecv_min_new = pls_optimise_components(Xop, y, max_lv)
        # If the new RMSE is less than the previous, accept the change
        if (rmsecv_min_new < rmsecv_min):
            bands = new_bands
            nbands = new_nbands
            opt_comp = opt_comp_new
            rmsecv_min = rmsecv_min_new
            rms.append(rmsecv_min_new)

            stdout.write("\r" + str(i))
            stdout.write(" ")
            stdout.write(str(opt_comp_new))
            stdout.write(" ")
            stdout.write(str(rmsecv_min_new))
            stdout.flush()

        # If the new RMSE is larger than the previous, accept it with some probability
        # dictated by the cooling parameter
        if (rmsecv_min_new > rmsecv_min):
            prob = np.exp(-(rmsecv_min_new - rmsecv_min) / cool)  # probability
            if (np.random.random() < prob):
                bands = new_bands
                nbands = new_nbands
                opt_comp = opt_comp_new
                rmsecv_min = rmsecv_min_new
                rms.append(rmsecv_min_new)

                stdout.write("\r" + str(i))
                stdout.write(" ")
                stdout.write(str(opt_comp_new))
                stdout.write(" ")
                stdout.write(str(rmsecv_min_new))
                stdout.flush()
            else:
                rms.append(rmsecv_min)

    stdout.write("\n")
    print(np.sort(bands))
    print('end')
    return (bands, opt_comp, rms)


Xc, ref = msc(X)
Xc = savgol_filter(Xc, 11, 3, deriv=1)
bands, opt_comp, rms = band_selection_sa(Xc, y, n_of_bands=40, max_lv=40, n_iter=10000)
predicted, r2cv, rmscv = base_pls(Xc[:, bands], y, opt_comp)
print("PLS optimised bands:")
print("   R2: %5.3f, RMSE: %5.3f" % (r2cv, rmscv))
print("   Number of Latent Variables: "+str(opt_comp))

plt.figure(figsize=(8, 6))
plt.semilogx(rms)
plt.xlabel("Iteration ")
plt.ylabel("RMSECV")
plt.show()