# Cross-validation methods for non-time-series and time-series data
from sklearn.model_selection import cross_val_score, KFold

# Splitter for non-time-series data: plain (unshuffled) 10-fold CV.
# BUG FIX: the original passed random_state=42 together with shuffle=False;
# random_state has no effect without shuffling and scikit-learn >= 0.24
# raises a ValueError for that combination, so it is dropped here.
kf = KFold(n_splits=10, shuffle=False)

from sklearn.model_selection import TimeSeriesSplit

# Splitter for time-series data: expanding-window split that never trains
# on samples that come after the validation fold.
tscv = TimeSeriesSplit(max_train_size=None, n_splits=17)
def cv_rmse(model, train_X, train_y):
    """Return the per-fold RMSE of *model* under 10-fold (unshuffled) CV.

    :param model: unfitted scikit-learn-compatible estimator
    :param train_X: training features
    :param train_y: training targets
    :return: numpy array with one RMSE value per KFold split
    """
    # cross_val_score returns negated MSE; flip the sign before the sqrt.
    neg_mse = cross_val_score(model, train_X, train_y,
                              scoring="neg_mean_squared_error", cv=kf)
    return np.sqrt(-neg_mse)
def cv_mae(model, train_X, train_y):
    """Return the mean MAE of *model* across TimeSeriesSplit folds.

    :param model: unfitted scikit-learn-compatible estimator
    :param train_X: training features
    :param train_y: training targets
    :return: scalar mean absolute error averaged over the tscv folds
    """
    fold_losses = -cross_val_score(model, train_X, train_y,
                                   scoring="neg_mean_absolute_error", cv=tscv)
    return np.mean(fold_losses)
def cv_mae_(model, train_X, train_y):
    """Diagnostic variant of cv_mae: plot per-fold MAE, then return the mean.

    Side effects: prints the per-fold validation losses and shows a
    matplotlib figure (blocking, depending on the backend).

    :param model: unfitted scikit-learn-compatible estimator
    :param train_X: training features
    :param train_y: training targets
    :return: scalar mean absolute error averaged over the tscv folds
    """
    val_loss = -cross_val_score(model, train_X, train_y,
                                scoring="neg_mean_absolute_error", cv=tscv)
    print('val loss is: {0}'.format(val_loss))
    import matplotlib.pyplot as plt
    plt.plot(val_loss, marker='o')
    plt.show()
    # BUG FIX: the original called cross_val_score a second time just to take
    # the mean, silently doubling the (expensive) cross-validation cost.
    # The per-fold losses are already in val_loss; reuse them.
    return np.mean(val_loss)
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import ElasticNetCV
from sklearn.linear_model import MultiTaskLassoCV
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
def build_model(train_X, train_y):
    """Cross-validate and fit a suite of candidate multi-output regressors.

    MultiTaskLassoCV supports multi-output targets natively; every other
    candidate is a single-output estimator wrapped in MultiOutputRegressor.
    The CV scores are collected for side-by-side comparison; only the
    fitted AdaBoost model is returned.

    :param train_X: training features
    :param train_y: multi-column training targets
    :return: fitted MultiOutputRegressor wrapping AdaBoostRegressor
    """
    cv_scores = {}

    # Natively multi-output — fitted directly, no wrapper needed.
    mult_lasso = MultiTaskLassoCV()
    cv_scores['lasso'] = cv_mae(mult_lasso, train_X, train_y)
    mult_lasso.fit(train_X, train_y)

    # Single-output candidates, evaluated and fitted in the same order
    # as before.
    candidates = [
        ('elncv', ElasticNetCV()),
        ('dtr', DecisionTreeRegressor()),
        ('rf', RandomForestRegressor()),
        ('adbt', AdaBoostRegressor(random_state=42)),
        ('gbdt', GradientBoostingRegressor()),
        ('xgb', XGBRegressor()),
    ]
    fitted = {}
    for tag, estimator in candidates:
        wrapped = MultiOutputRegressor(estimator)
        cv_scores[tag] = cv_mae(wrapped, train_X, train_y)
        fitted[tag] = wrapped.fit(train_X, train_y)

    return fitted['adbt']
# BUG FIX: sklearn.externals.joblib was deprecated in scikit-learn 0.21 and
# removed in 0.23 — joblib is a standalone package now.
import joblib


def save_model(mult_model, model_save_path):
    """Persist a fitted model to disk with joblib.

    :param mult_model: model object to be saved
    :param model_save_path: destination path, e.g. "./model.pkl"
    :return: None
    """
    joblib.dump(mult_model, model_save_path)
def load_model(model_path):
    """Load and return a model previously saved with joblib.

    :param model_path: path to the serialized model file
    :return: the deserialized model object
    """
    loaded_model = joblib.load(model_path)
    return loaded_model
def train_model(train_X, train_y):
    """Fit the final multi-output AdaBoost(DecisionTree) model.

    Hyper-parameters come from a prior tuning run (see
    parameter_optimize_adbt in parameter_optimize).

    :param train_X: training features
    :param train_y: multi-column training targets
    :return: fitted MultiOutputRegressor wrapping AdaBoostRegressor
    """
    # FIX (consistency/reproducibility): build_model pins
    # AdaBoostRegressor(random_state=42) but the final model here did not,
    # so the shipped model differed between runs; pin it as well.
    adbt_best_params = {'n_estimators': 170, 'learning_rate': 0.21,
                        'random_state': 42}
    dtr_best_params = {'splitter': 'best', 'max_depth': 8,
                       'min_samples_split': 0.11, 'min_samples_leaf': 0.03,
                       'random_state': 42}
    # NOTE(review): `base_estimator` was renamed to `estimator` in
    # scikit-learn 1.2 and removed in 1.4 — rename when upgrading.
    model = AdaBoostRegressor(
        base_estimator=DecisionTreeRegressor(**dtr_best_params),
        **adbt_best_params)
    mult_model = MultiOutputRegressor(model)
    return mult_model.fit(train_X, train_y)
from parameter_optimize import parameter_optimize_adbt
if __name__ == '__main__':
    file_name = 'HLK-21C02(4℃).csv'
    file_name_ = os.path.splitext(file_name)[0]
    n_cnt = 4
    # NOTE(review): data_preprocess is not imported anywhere in this file —
    # confirm which module is expected to provide it.
    df_file = data_preprocess(file_name_, n_cnt)
    df_y = df_file
    # Target columns are named '0' .. str(2*n_cnt - 1); everything else
    # in df_file is treated as a feature.
    goals_list = [str(x) for x in range(0, n_cnt * 2)]
    features_list = [ele for ele in list(df_file.columns) if ele not in goals_list]
    # Hold out the last 20% of rows as the (chronologically later) test set.
    num_test = int(len(df_file) * 0.2)
    test_y = df_y.loc[len(df_file) - num_test:][goals_list]
    test_y = test_y.reset_index(drop=True)
    df_train = df_file.loc[:len(df_file) - num_test - 1]
    df_train = df_train.reset_index(drop=True)
    train_X = df_train[features_list]
    train_y = df_train[goals_list]
    df_test = df_file.loc[len(df_file) - num_test:]
    df_test = df_test.reset_index(drop=True)
    test_X = df_test[features_list]

    mult_model = train_model(train_X, train_y)
    res_pre = np.round(mult_model.predict(test_X))
    df_res = pd.DataFrame(res_pre)
    # Wrap predictions in the first and last target columns back below
    # 86400 (presumably seconds-of-day — TODO confirm against data source).
    df_res[0] = df_res[0].apply(lambda x: x - 86400 if x > 86400 else x)
    df_res[len(goals_list) - 1] = df_res[len(goals_list) - 1].apply(
        lambda x: x - 86400 if x > 86400 else x)

    # Evaluation: overall MAE, per-row MAE, and per-target MAE.
    eval_model = np.mean(np.mean(abs(df_res.values - test_y.values)))
    eval_model_3 = np.mean(abs(df_res.values - test_y.values), axis=1)
    from sklearn.metrics import mean_absolute_error
    eval_model_1 = mean_absolute_error(test_y.values, df_res, multioutput='raw_values')
    eval_model_2 = mean_absolute_error(test_y.values, df_res)
    # BUG FIX: the script previously ended with an empty print(); report the
    # metrics it computed instead.
    print('overall MAE: {0}'.format(eval_model_2))
    print('per-target MAE: {0}'.format(eval_model_1))