Steam Prediction: Tuning Models with Grid Search

First, import the required libraries:

import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
plt.rcParams.update({'figure.max_open_warning': 0})
import seaborn as sns

# modelling
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     RepeatedKFold, cross_val_score,
                                     cross_val_predict, KFold)
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.svm import LinearSVR, SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
#from xgboost import XGBRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
Load the data
# load_dataset: read the tab-separated train and test files
with open("蒸汽预测数据/zhengqi_train.txt") as fr:
    data_train = pd.read_table(fr, sep="\t")
with open("蒸汽预测数据/zhengqi_test.txt") as fr_test:
    data_test = pd.read_table(fr_test, sep="\t")

#print(data_train)
#print(data_test)
Merge the train and test sets
# merge the train and test sets, tagging each row with its origin
data_train["oringin"] = "train"
data_test["oringin"] = "test"
data_all = pd.concat([data_train, data_test], axis=0, ignore_index=True)
print(data_all)

Drop features whose distributions differ significantly between the train and test sets

data_all.drop(["V5","V9","V11","V17","V22","V28"], axis=1, inplace=True)  # drop the drifted features
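For reference, one way such drifted columns can be flagged programmatically is a two-sample Kolmogorov-Smirnov test comparing each feature's train and test distributions. A minimal sketch (the thresholds are illustrative; the six columns above were presumably chosen from KDE plots in an earlier exploratory step):

from scipy.stats import ks_2samp

# flag features whose train/test distributions differ strongly (illustrative thresholds)
for col in data_train.columns.drop(["target", "oringin"]):
    stat, p = ks_2samp(data_train[col], data_test[col])
    if p < 0.01 and stat > 0.1:
        print(f"{col}: KS statistic={stat:.3f}, p-value={p:.2e}")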

 

Normalize the data (min-max scaling)
# normalise numeric columns (min-max scaling to [0, 1])
cols_numeric = list(data_all.columns)  # get the column names

cols_numeric.remove("oringin")

def scale_minmax(col):
    return (col - col.min()) / (col.max() - col.min())

scale_cols = [col for col in cols_numeric if col != 'target']

data_all[scale_cols] = data_all[scale_cols].apply(scale_minmax, axis=0)  # axis=0: apply column-wise
print(data_all)
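For reference, the hand-rolled scale_minmax above performs the same transformation as sklearn's MinMaxScaler (already imported). A minimal sketch, shown on a copy since data_all has just been scaled in place:

# sketch: equivalent scaling via sklearn's MinMaxScaler
demo_scaled = pd.DataFrame(MinMaxScaler().fit_transform(data_all[scale_cols]),
                           columns=scale_cols)
# demo_scaled equals data_all[scale_cols] up to floating-point error,
# since re-scaling already min-max-scaled columns is the identity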

 

Apply a Box-Cox transform to the feature variables so that they better satisfy normality. The Box-Cox transform, proposed by Box and Cox in 1964, is a generalized power transform commonly used in statistical modeling when a continuous response variable does not follow a normal distribution. After the transform, unobservable error and the correlation between predictor variables can be reduced to some extent. Its defining feature is a single parameter that is estimated from the data itself and determines the exact form of the transform. Box-Cox can markedly improve the normality, symmetry, and homoscedasticity of the data and works well on many real-world datasets; the transform is defined below.
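For reference, the one-parameter Box-Cox transform that scipy.stats.boxcox computes is

    y(lambda) = (y**lambda - 1) / lambda    if lambda != 0
    y(lambda) = ln(y)                       if lambda == 0

defined for y > 0, with lambda estimated by maximum likelihood. This is also why the code below adds 1 to the min-max-scaled columns: it keeps every value strictly positive.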

 

cols_transform = data_all.columns[0:-2]  # all feature columns (excludes target and oringin)
for col in cols_transform:
    # transform the column; +1 keeps every value strictly positive, as Box-Cox requires
    data_all.loc[:, col], _ = stats.boxcox(data_all.loc[:, col] + 1)

# Exponentially transform the target (1.5 ** target) to make its distribution
# more normal, and plot the result
sp = data_train.target
data_train["target1"] = np.power(1.5, sp)  # 1.5 raised to the power of the target
print(data_train["target1"].describe())

plt.figure(figsize=(12, 4))
plt.subplot(1, 2, 1)
sns.distplot(data_train["target1"].dropna(), fit=stats.norm)
plt.subplot(1, 2, 2)
_ = stats.probplot(data_train["target1"].dropna(), plot=plt)

 

Get the training data
# function to get the training samples
def get_training_data():
    # extract training samples (copy to avoid mutating data_all)
    df_train = data_all[data_all["oringin"] == "train"].copy()
    df_train["label"] = data_train["target1"]  # the exponentially transformed target
    # split target and features
    y = df_train.target
    X = df_train.drop(["oringin", "target", "label"], axis=1)
    X_train, X_valid, y_train, y_valid = train_test_split(X,
                                                          y,
                                                          test_size=0.3,
                                                          random_state=100)
    return X_train, X_valid, y_train, y_valid


Get the test data

# extract test features (the test set has no target values)
def get_test_data():
    df_test = data_all[data_all["oringin"] == "test"].reset_index(drop=True)
    return df_test.drop(["oringin", "target"], axis=1)

Build the scoring functions

# scoring functions
# metric for evaluation
def rmse(y_true, y_pred):
    diff = y_pred - y_true
    sum_sq = sum(diff ** 2)
    n = len(y_pred)

    return np.sqrt(sum_sq / n)  # root mean squared error


def mse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred)  # mean squared error


# scorers to be used in sklearn model fitting
rmse_scorer = make_scorer(rmse, greater_is_better=False)
mse_scorer = make_scorer(mse, greater_is_better=False)
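Because greater_is_better=False, make_scorer negates the metric so that "larger is better" holds uniformly inside sklearn's model selection. A quick check on hypothetical toy data:

# toy check (illustrative): the scorer returns the negated RMSE
demo_X = np.arange(10, dtype=float).reshape(-1, 1)
demo_y = 2 * demo_X.ravel() + 1
demo_model = LinearRegression().fit(demo_X, demo_y)
print(rmse_scorer(demo_model, demo_X, demo_y))  # about -0.0: perfect fit, sign flipped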

Detect and remove outliers with a model, and plot them

# find outliers and plot them
# function to detect outliers based on the predictions of a model
def find_outliers(model, X, y, sigma=3):
    # predict y values using the model
    try:
        y_pred = pd.Series(model.predict(X), index=y.index)
    # if prediction fails, try fitting the model first
    except Exception:
        model.fit(X, y)
        y_pred = pd.Series(model.predict(X), index=y.index)

    # calculate residuals between the model predictions and true y values
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()  # standard deviation of the residuals

    # calculate z statistic; define outliers to be where |z| > sigma
    z = (resid - mean_resid) / std_resid
    outliers = z[abs(z) > sigma].index  # indices of the outliers

    # print and plot the results
    print('R2=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print("mse=", mean_squared_error(y, y_pred))
    print('---------------------------------------')

    print('mean of residuals:', mean_resid)
    print('std of residuals:', std_resid)
    print('---------------------------------------')

    print(len(outliers), 'outliers:')
    print(outliers.tolist())

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.plot(y.loc[outliers], y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y_pred');

    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.plot(y.loc[outliers], y.loc[outliers] - y_pred.loc[outliers], 'ro')
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('y')
    plt.ylabel('y - y_pred');

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    z.loc[outliers].plot.hist(color='r', bins=50, ax=ax_133)
    plt.legend(['Accepted', 'Outlier'])
    plt.xlabel('z')

    plt.savefig('outliers.png')

    return outliers


# get the training data (Ridge is already imported at the top)
X_train, X_valid, y_train, y_valid = get_training_data()
test = get_test_data()


# find and remove outliers using a Ridge model
outliers = find_outliers(Ridge(), X_train, y_train)

# alternative: remove the outliers from the merged frame instead
#df_train = data_all[data_all["oringin"]=="train"]
#df_train["label"]=data_train["target1"]
#df_train=df_train.drop(outliers)
X_outliers=X_train.loc[outliers]
y_outliers=y_train.loc[outliers]
# permanently remove these outliers from the data
X_t=X_train.drop(outliers)
y_t=y_train.drop(outliers)


Train models on the data with outliers removed

def get_trainning_data_omitoutliers():
    # return the training data with outliers removed
    y1 = y_t.copy()
    X1 = X_t.copy()
    return X1, y1
Train the model with grid search
# train a model with grid search

def train_model(model, param_grid=[], X=[], y=[], splits=5, repeats=5):

    # get the unmodified training data, unless the data to use is already specified
    if len(y) == 0:
        X, y = get_trainning_data_omitoutliers()
        #poly_trans=PolynomialFeatures(degree=2)
        #X=poly_trans.fit_transform(X)
        #X=MinMaxScaler().fit_transform(X)

    # create the cross-validation method
    # RepeatedKFold(n_splits=splits, n_repeats=repeats) runs splits-fold CV n_repeats
    # times: e.g. with splits=10, repeats=2, one parameter setting is first scored on
    # 10 folds, then the folds are re-drawn and the same setting is scored on 10 new
    # folds, i.e. 20 train/validation runs per parameter setting
    rkfold = RepeatedKFold(n_splits=splits, n_repeats=repeats)
    # perform a grid search if param_grid given
    if len(param_grid) > 0:
        # setup grid search parameters
        gsearch = GridSearchCV(model,
                               param_grid,
                               cv=rkfold,
                               scoring="neg_mean_squared_error",
                               verbose=1,
                               return_train_score=True)

        # search the grid
        gsearch.fit(X, y)

        # extract best model from the grid
        model = gsearch.best_estimator_
        best_idx = gsearch.best_index_  # index of the best parameter setting

        # get the cv-scores for the best model
        grid_results = pd.DataFrame(gsearch.cv_results_)  # convert the results dict to a DataFrame
        cv_mean = abs(grid_results.loc[best_idx, 'mean_test_score'])  # mean validation score of the best model
        cv_std = grid_results.loc[best_idx, 'std_test_score']  # std of the best model's validation scores

    # no grid search, just cross-val score for the given model
    else:
        grid_results = []
        cv_results = cross_val_score(model,
                                     X,
                                     y,
                                     scoring="neg_mean_squared_error",
                                     cv=rkfold)
        cv_mean = abs(np.mean(cv_results))
        cv_std = np.std(cv_results)
        model.fit(X, y)  # cross_val_score clones the model, so fit it before predicting below

    # combine the mean and std cv-scores into a pandas Series
    cv_score = pd.Series({'mean': cv_mean, 'std': cv_std})

    # predict y using the fitted model
    y_pred = model.predict(X)

    # print stats on model performance
    print('----------------------')
    print(model)
    print('----------------------')
    print('score=', model.score(X, y))
    print('rmse=', rmse(y, y_pred))
    print('mse=', mse(y, y_pred))
    print('cross_val: mean=', cv_mean, ', std=', cv_std)

    # residual plots
    y_pred = pd.Series(y_pred, index=y.index)
    resid = y - y_pred
    mean_resid = resid.mean()
    std_resid = resid.std()
    z = (resid - mean_resid) / std_resid
    n_outliers = sum(abs(z) > 3)

    plt.figure(figsize=(15, 5))
    ax_131 = plt.subplot(1, 3, 1)
    plt.plot(y, y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y_pred')
    plt.title('corr = {:.3f}'.format(np.corrcoef(y, y_pred)[0][1]))
    ax_132 = plt.subplot(1, 3, 2)
    plt.plot(y, y - y_pred, '.')
    plt.xlabel('y')
    plt.ylabel('y - y_pred')
    plt.title('std resid = {:.3f}'.format(std_resid))

    ax_133 = plt.subplot(1, 3, 3)
    z.plot.hist(bins=50, ax=ax_133)
    plt.xlabel('z')
    plt.title('{:.0f} samples with z>3'.format(n_outliers))

    return model, cv_score, grid_results

# places to store optimal models and scores
opt_models = dict()

score_models = pd.DataFrame(columns=['mean', 'std'])
# no. k-fold splits
splits=5
# no. k-fold iterations
repeats=5
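As a quick sanity check (illustrative), RepeatedKFold will report the total number of train/validation splits these settings generate:

# 5 folds x 5 repeats = 25 splits per parameter candidate
rkf_demo = RepeatedKFold(n_splits=splits, n_repeats=repeats)
print(rkf_demo.get_n_splits())  # 25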

Ridge regression

# Ridge regression
model = 'Ridge'

opt_models[model] = Ridge()
alph_range = np.arange(0.25, 6, 0.25)
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid,
    splits=splits, repeats=repeats)

cv_score.name = model  # cv_score is a pandas Series; its name becomes the row index
score_models = pd.concat([score_models, cv_score.to_frame().T])  # DataFrame.append was removed in pandas 2.0
# resulting layout, e.g.:
#        mean   std
# Ridge   ...   ...

plt.figure()
plt.errorbar(alph_range, abs(grid_results['mean_test_score']),
             abs(grid_results['std_test_score'])/np.sqrt(splits*repeats))
plt.xlabel('alpha')
plt.ylabel('score')

Result

----------------------
Ridge(alpha=0.25, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)
----------------------
[Parallel(n_jobs=1)]: Done 575 out of 575 | elapsed:    2.1s finished
score= 0.8963563380306225
rmse= 0.31899649233050614
mse= 0.1017587621191668
cross_val: mean= 0.10626958248501453 , std= 0.008332462869169874
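The 575 fits reported by joblib are consistent: np.arange(0.25, 6, 0.25) yields 23 alpha candidates, each evaluated on 5 splits x 5 repeats = 25 folds, and 23 x 25 = 575.

The same train_model pattern extends to the other regressors imported at the top. A minimal sketch for Lasso (the alpha range here is an illustrative assumption, not taken from the original run):

# Lasso with the same grid-search helper
model = 'Lasso'
opt_models[model] = Lasso()
alph_range = np.arange(1e-4, 1e-3, 4e-5)  # hypothetical search range; tune to your data
param_grid = {'alpha': alph_range}

opt_models[model], cv_score, grid_results = train_model(
    opt_models[model], param_grid=param_grid,
    splits=splits, repeats=repeats)

cv_score.name = model
score_models = pd.concat([score_models, cv_score.to_frame().T])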
