Data loading and deep copy
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore")
train = pd.read_csv("./练习数据/ch03_practice_1.csv")
train
train_saved = train.copy()
def load_data():
    # return a fresh copy so later exercises can freely mutate `train`
    train = train_saved.copy()
    return train
train = load_data()
Feature transformation
Standardization (zero mean, unit variance)
# 1.1 Feature transformation practice: standardization
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(train)
train_x = scaler.transform(train)
train_x
Min-max normalization (0, 1)
# Feature transformation practice: min-max normalization
train = load_data()
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(train)  # fit() returns the scaler itself, not transformed data
train_x = scaler.transform(train)
train_x
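A quick sanity check (a sketch, assuming train is all numeric): MinMaxScaler also provides inverse_transform, which should recover the original values.
restored = scaler.inverse_transform(train_x)  # map the (0, 1)-scaled values back
print(np.allclose(restored, train.values))  # True up to floating-point error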
Nonlinear transformation
Log transform
# 1.2 Feature transformation practice: nonlinear transformations
x = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0])
# Plain log
x1 = np.log(x)
# log(1 + x)
x2 = np.log1p(x)
# Log of the absolute value, with the original sign restored
x3 = np.sign(x) * np.log(np.abs(x))
import matplotlib.pyplot as plt
import seaborn as sns
# Log transform plots (histplot replaces the now-removed distplot)
plt.subplot(1, 4, 1)
sns.histplot(x, kde=True)
plt.subplot(1, 4, 2)
sns.histplot(x1, kde=True)
plt.subplot(1, 4, 3)
sns.histplot(x2, kde=True)
plt.subplot(1, 4, 4)
sns.histplot(x3, kde=True)
plt.show()
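A quick check of why the variants exist (a minimal sketch; the values are easy to verify by hand):
print(np.log1p(0.0))  # 0.0, whereas np.log(0.0) would be -inf
print(np.sign(-100.0) * np.log(np.abs(-100.0)))  # about -4.605: the sign-log trick handles negatives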
Box-Cox transform
# Box-Cox transform
train = load_data()
cols = [c for c in train.columns.tolist() if (train[c] > 0.0).all()]  # Box-Cox requires strictly positive values
from sklearn.preprocessing import PowerTransformer
pt = PowerTransformer(method="box-cox")
pt.fit(train[cols])
train_x = pt.transform(train[cols])
plt.subplot(1, 2, 1)
sns.histplot(train[cols].values.ravel(), kde=True)  # pooled values before the transform
plt.subplot(1, 2, 2)
sns.histplot(train_x.ravel(), kde=True)  # pooled values after the transform
plt.show()
Yeo-Johnson transform
# Yeo-Johnson transform (unlike Box-Cox, it also accepts zero and negative values)
train = load_data()
pt = PowerTransformer(method="yeo-johnson")
pt.fit(train[cols])
train_x = pt.transform(train[cols])
plt.subplot(1, 2, 1)
sns.histplot(train[cols].values.ravel(), kde=True)
plt.subplot(1, 2, 2)
sns.histplot(train_x.ravel(), kde=True)
plt.show()
Feature encoding
FeatureHasher
# 1.3 Feature encoding: FeatureHasher practice
train = load_data()
from sklearn.feature_extraction import FeatureHasher
col = ["product", "year", "month", "day"]
for c in col:
    fh = FeatureHasher(n_features=4, input_type="string")
    hash_train = fh.transform(train[[c]].astype(str).values)
    hash_train = pd.DataFrame(
        hash_train.todense(), columns=[f"{c}_{i}" for i in range(4)]
    )
    train = pd.concat([train, hash_train], axis=1)
train
frequency encoding
# Feature encoding: frequency encoding practice
train = load_data()
for c in col:
    freq = train[c].value_counts()
    train[c] = train[c].map(freq)  # replace each category with its count in train
train
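If a held-out set were present, the same freq mapping learned on train should be applied to it inside the loop above; unseen categories become NaN and can be filled with 0. A sketch with a hypothetical test frame:
#     test[c] = test[c].map(freq).fillna(0)  # inside the loop, right after freq is computed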
target encoding
# Feature encoding: target encoding practice
train = pd.read_csv("./练习数据/ch03_practice_2.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold
# target encoding
for c in col:
    data_tmp = pd.DataFrame({c: train_x[c], "target": train_y})
    target_mean = data_tmp.groupby(c)["target"].mean()  # means over all rows (the fold loop below recomputes on training folds)
    tmp = np.repeat(np.nan, train_x.shape[0])  # np.repeat builds an ndarray of the given size, filled with NaN
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for idx_1, idx_2 in kf.split(train_x):
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()  # means from the other folds only
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)
    train_x[c] = tmp
train_x
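For a test set, the usual companion step (a sketch, not in the original; `test` would be a separate test frame) encodes with target means computed over the full training data:
# for c in col:
#     target_mean = train.groupby(c)["target"].mean()  # means over all training rows
#     test[c] = test[c].map(target_mean)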
Wide and long tables
# 1.4 Wide and long tables
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time
df_wide = df_time
df_long = df_wide.stack().reset_index(1)  # stack() moves the column index into the innermost row index level
df_long.columns = ["id", "value"]
df_long
df_wide = df_long.pivot(
    index=None, columns="id", values="value"
)  # reshape back to wide: one column per id value
df_wide
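An equivalent wide-to-long route (a sketch) is DataFrame.melt with ignore_index=False, which keeps the datetime index:
df_long2 = df_wide.melt(ignore_index=False, var_name="id", value_name="value")
df_long2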
Feature construction
Features from missing values
# 2.2 Per-row missing-count feature and a missing-indicator feature
train["nan_count"] = train.isnull().sum(axis=1)
train["year_nan"] = train.year.isnull().astype(int)
train
Sliding-window feature construction
# 2.4 Sliding-window features
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time_A = df_time[["A"]]
df_time_A_saved = df_time_A.copy()
def load_data():
    df_time_A = df_time_A_saved.copy()
    return df_time_A
x = load_data()
x_lag1 = x.shift(1)  # value 1 step back
x_lag7 = x.shift(7)  # value 7 steps back
x["lag1"] = x_lag1
x["lag7"] = x_lag7
x
x = load_data()
x_avg3 = x.shift(1).rolling(window=3).mean()  # rolling() aggregates over a fixed-size window
x_max7 = x.shift(1).rolling(window=7).max()
x_e7_avg = (x.shift(7) + x.shift(14) + x.shift(21)) / 3.0
x["avg3"] = x_avg3
x["max7"] = x_max7
x["e7_avg"] = x_e7_avg
x
Unsupervised feature construction
# 2.5 Unsupervised features
# pip install umap-learn
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(df_time)
df = scaler.transform(df_time)
UMAP
UMAP accepts data with more than two features and outputs a low-dimensional embedding for exploring the dataset. Similar samples tend to cluster together in the resulting scatter plot; the aim is to display, in low dimension, the clusters and relationships among samples from the high-dimensional space.
import umap
um = umap.UMAP()
um.fit(df)
df_um = pd.DataFrame(um.transform(df), columns=["um_1", "um_2"], index=df_time.index)
df_um
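To actually look at the embedding (a minimal sketch reusing matplotlib from above), plot the two UMAP coordinates; nearby points are samples UMAP considers similar:
plt.scatter(df_um["um_1"], df_um["um_2"], s=10)
plt.xlabel("um_1")
plt.ylabel("um_2")
plt.show()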
K-means clustering
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0)  # seed added for reproducibility
kmeans.fit(df_time)
df_clusters = kmeans.predict(df_time)
df_distances = kmeans.transform(df_time)  # distance from each sample to every cluster center
df_distances = pd.DataFrame(
    df_distances,
    columns=["distance_1", "distance_2", "distance_3"],
    index=df_time.index,
)
df = pd.concat([df_time, df_distances], axis=1)
df
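The hard cluster assignment itself is also usable as a categorical feature (a sketch):
df["cluster"] = df_clusters  # cluster id per sample from kmeans.predict above
df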
Feature importance
# 2.7 Feature importance output
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=10)
clf.fit(train_x, train_y)
fi = clf.feature_importances_  # ndarray with one importance value per feature
idx = np.argsort(fi)[::-1]  # feature indices sorted by importance, descending
top_features, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print("random forest importance")
print(top_features, top_importances)
xgboost
import xgboost as xgb
dtrain = xgb.DMatrix(train_x, label=train_y)
params = {"objective": "binary:logistic", "silent": 1}
num_round = 20
model = xgb.train(params, dtrain, num_round)
fscore = model.get_score(importance_type="total_gain")
fscore = sorted(
    [(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True
)
print("xgboost importance")
print(fscore[:5])
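get_score accepts several importance_type values ('weight', 'gain', 'cover', 'total_gain', 'total_cover'), and rankings can differ between them; a sketch comparing a few:
for imp_type in ["weight", "gain", "total_gain"]:
    scores = model.get_score(importance_type=imp_type)
    print(imp_type, sorted(scores.items(), key=lambda kv: kv[1], reverse=True)[:3])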
Greedy search
# 2.7 Feature importance output (greedy feature search)
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(train_x, train_y)
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True)
# Build the training and validation sets
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss
def evaluate(features):
    dtrain = xgb.DMatrix(tr_x[features], label=tr_y)
    dvalid = xgb.DMatrix(va_x[features], label=va_y)
    params = {"objective": "binary:logistic", "verbosity": 0}
    num_round = 10
    early_stopping_rounds = 3
    watchlist = [(dtrain, "train"), (dvalid, "eval")]
    model = xgb.train(
        params,
        dtrain,
        num_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=0,
    )
    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)  # lower log_loss means a better model
    return score
best_score = 9999.0
candidates = np.random.RandomState(0).permutation(train_x.columns)  # random ordering of the column names
selected = set([])
for feature in candidates:
    fs = list(selected) + [feature]
    score = evaluate(fs)
    if score < best_score:  # keep the feature only if it improves on the best score so far
        selected.add(feature)
        best_score = score
        print(f"selected:{feature}")
        print(f"score:{score}")
print(f"selected features: {selected}")
Model tuning (XGBoost)
Bayesian optimization
# 3.4 XGBoost tuning
# Bayesian optimization practice
class Model:
    def __init__(self, params=None):
        self.model = None
        if params is None:
            self.params = {}
        else:
            self.params = params

    def fit(self, tr_x, tr_y, va_x, va_y):
        params = {"objective": "binary:logistic", "verbosity": 0}
        params.update(self.params)
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import log_loss
def score(params):
    params["max_depth"] = int(params["max_depth"])  # hyperopt's quniform returns floats
    model = Model(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}
space = {
    "min_child_weight": hp.quniform("min_child_weight", 1, 5, 1),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "gamma": hp.quniform("gamma", 0, 0.4, 0.1),
}
max_evals = 10
trials = Trials()
history = []
fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")
XGBoost parameter space
# 3.4.2 XGBoost parameter space
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eta": 0.1,
    "gamma": 0.0,
    "alpha": 0.0,
    "lambda": 1.0,
    "min_child_weight": 1,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 0,
}
param_space = {
    "min_child_weight": hp.loguniform("min_child_weight", np.log(0.1), np.log(10)),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "subsample": hp.quniform("subsample", 0.6, 0.95, 0.05),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 0.95, 0.05),
    "gamma": hp.loguniform("gamma", np.log(1e-8), np.log(1.0)),
    # tune these as well when the compute budget allows:
    # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0))
}
Neural network tuning
# 3.5 Neural network tuning
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from hyperopt import hp
from keras.callbacks import EarlyStopping
from keras.layers import BatchNormalization, Dense, Dropout, PReLU, ReLU  # flat imports; the old advanced_activations/core module paths were removed from Keras
from keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import SGD, Adam
base_param = {
    "input_dropout": 0.0,
    "hidden_layers": 3,
    "hidden_units": 96,
    "hidden_activation": "relu",
    "hidden_dropout": 0.2,
    "batch_norm": "before_act",
    "optimizer": {"type": "adam", "lr": 0.001},
    "batch_size": 64,
}
param_space = {
    "input_dropout": hp.quniform("input_dropout", 0, 0.2, 0.05),
    "hidden_layers": hp.quniform("hidden_layers", 2, 4, 1),
    "hidden_units": hp.quniform("hidden_units", 32, 256, 32),
    "hidden_activation": hp.choice("hidden_activation", ["prelu", "relu"]),
    "hidden_dropout": hp.quniform("hidden_dropout", 0, 0.3, 0.05),
    "batch_norm": hp.choice("batch_norm", ["before_act", "no"]),
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "type": "adam",
                "lr": hp.loguniform("adam_lr", np.log(0.00001), np.log(0.01)),
            },
            {
                "type": "sgd",
                "lr": hp.loguniform("sgd_lr", np.log(0.00001), np.log(0.01)),
            },
        ],
    ),
    "batch_size": hp.quniform("batch_size", 32, 128, 32),
}
class MLP:
    def __init__(self, params):
        self.params = params
        self.scaler = None
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        input_dropout = self.params["input_dropout"]
        hidden_layers = int(self.params["hidden_layers"])
        hidden_units = int(self.params["hidden_units"])
        hidden_activation = self.params["hidden_activation"]
        hidden_dropout = self.params["hidden_dropout"]
        batch_norm = self.params["batch_norm"]
        optimizer_type = self.params["optimizer"]["type"]
        optimizer_lr = self.params["optimizer"]["lr"]
        batch_size = int(self.params["batch_size"])
        # Standardize the inputs
        self.scaler = StandardScaler()
        tr_x = self.scaler.fit_transform(tr_x)
        va_x = self.scaler.transform(va_x)
        self.model = Sequential()
        # Input layer
        self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],)))
        # Hidden layers
        for i in range(hidden_layers):
            self.model.add(Dense(hidden_units))
            if batch_norm == "before_act":
                self.model.add(BatchNormalization())
            if hidden_activation == "prelu":
                self.model.add(PReLU())
            elif hidden_activation == "relu":
                self.model.add(ReLU())
            else:
                raise NotImplementedError
            self.model.add(Dropout(hidden_dropout))
        # Output layer
        self.model.add(Dense(1, activation="sigmoid"))
        # Optimizer (recent Keras renamed lr to learning_rate and dropped decay)
        if optimizer_type == "sgd":
            optimizer = SGD(learning_rate=optimizer_lr, momentum=0.9, nesterov=True)
        elif optimizer_type == "adam":
            optimizer = Adam(learning_rate=optimizer_lr, beta_1=0.9, beta_2=0.999)
        else:
            raise NotImplementedError
        # Loss and evaluation metric
        self.model.compile(
            loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
        )
        nb_epoch = 200
        patience = 20
        early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)
        history = self.model.fit(
            tr_x,
            tr_y,
            epochs=nb_epoch,
            batch_size=batch_size,
            verbose=1,
            validation_data=(va_x, va_y),
            callbacks=[early_stopping],
        )

    def predict(self, x):
        x = self.scaler.transform(x)
        y_pred = self.model.predict(x)
        y_pred = y_pred.flatten()
        return y_pred
def score(params):
    model = MLP(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}
max_evals = 10
trials = Trials()
history = []
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")
Linear model
# 3.6 Linear model practice
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
tr_x = scaler.fit_transform(tr_x)
va_x = scaler.transform(va_x)
model = LogisticRegression()
model.fit(tr_x, tr_y)
va_pred = model.predict_proba(va_x)
score = log_loss(va_y, va_pred)
print(f"logloss: {score:.4f}")
Custom objective and evaluation functions
# 3.7 Custom objective and evaluation functions
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# Custom objective function and evaluation metric
def logregobj(preds, dtrain):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # sigmoid: raw margins -> probabilities
    grad = preds - labels  # gradient of logloss
    hess = preds * (1.0 - preds)  # hessian of logloss
    return grad, hess

def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    return "custom-error", float(sum(labels != (preds > 0.0))) / len(labels)  # error rate at margin 0
params = {"silent": 1, "random_state": 0}
num_round = 50
watchlist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))  # the custom objective outputs raw margins, so apply the sigmoid ourselves
logloss = log_loss(va_y, pred)
print(logloss)
params = {"silent": 1, "random_state": 0, "objective": "binary:logistic"}
bst = xgb.train(params, dtrain, num_round, watchlist)
pred = bst.predict(dvalid)
logloss = log_loss(va_y, pred)
print(logloss)
# A MAE-like objective for xgboost (fair loss)
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold
kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# A MAE-like objective: the fair loss, a smooth MAE surrogate (MAE's own hessian is zero, so xgboost cannot use it directly)
def fair(preds, dtrain):
    x = preds - dtrain.get_label()  # residuals
    c = 1.0  # fair-loss constant
    den = abs(x) + c
    grad = c * x / den  # gradient of the fair loss
    hess = c * c / den ** 2  # hessian of the fair loss
    return grad, hess
param = {"max_depth": 3, "eta": 1, "silent": 1}
watchlist = [(dvalid, "eval"), (dtrain, "train")]
num_round = 15
bst = xgb.train(param, dtrain, num_round, watchlist, obj=fair)
Model ensembling
Stacking
# 3.8.1 Stacking practice: build the base models
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.layers import Dense, Dropout
from keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf
tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
class Model1Xgb:
    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        params = {
            "objective": "binary:logistic",
            "verbosity": 0,
            "random_state": 0,
            "eval_metric": "logloss",
        }
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
class Model1NN:
    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        batch_size = 128
        epochs = 10
        tr_x = self.scaler.transform(tr_x)
        va_x = self.scaler.transform(va_x)
        model = Sequential()
        model.add(Dense(256, activation="relu", input_shape=(tr_x.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer="adam")
        history = model.fit(
            tr_x,
            tr_y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(va_x, va_y),
        )
        self.model = model

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x).reshape(-1)
        return pred
class Model2Linear:
    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = LogisticRegression(solver="lbfgs")
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict_proba(x)[:, 1]
        return pred
def predict_cv(model, train_x, train_y, test_x):
    # Out-of-fold predictions for train; test predictions are averaged over the fold models.
    preds = []
    preds_test = []
    va_idxes = []
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)  # restore the original row order
    pred_train = preds[order]
    preds_test = np.mean(preds_test, axis=0)
    return pred_train, preds_test
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)
model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x, train_y, test_x)
print(f"logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}")
train_x_2 = pd.DataFrame({"pred_1a": pred_train_1a, "pred_1b": pred_train_1b})
test_x_2 = pd.DataFrame({"pred_1a": pred_test_1a, "pred_1b": pred_test_1b})
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f"logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}")
Hold-out
# 3.8.2 Hold-out stacking practice
kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
model_1a = Model1Xgb()
model_1a.fit(tr_x, tr_y, va_x, va_y)
va_pred_1a = model_1a.predict(va_x)
test_pred_1a = model_1a.predict(test_x)
model_1b = Model1NN()
model_1b.fit(tr_x, tr_y, va_x, va_y)
va_pred_1b = model_1b.predict(va_x)
test_pred_1b = model_1b.predict(test_x)
print(f"logloss: {log_loss(va_y, va_pred_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(va_y, va_pred_1b, eps=1e-7):.4f}")
va_x_2 = pd.DataFrame({"pred_1a": va_pred_1a, "pred_1b": va_pred_1b})
test_x_2 = pd.DataFrame({"pred_1a": test_pred_1a, "pred_1b": test_pred_1b})
model2 = Model2Linear()
model2.fit(va_x_2, va_y, None, None)
pred_test_2 = model2.predict(test_x_2)
Cross-validation
StratifiedKFold
# 4.1 Cross-validation, advanced: StratifiedKFold
import numpy as np
import pandas as pd
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x, train_y):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
GroupKFold
# Cross-validation, advanced: GroupKFold, using 'user_id' as the group key
train_x["user_id"] = np.arange(0, len(train_x)) // 4
train_x
# GroupKFold练习
from sklearn.model_selection import GroupKFold, KFold
user_id = train_x["user_id"]
unique_user_ids = user_id.unique()
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, user_id):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
# GroupKFold cannot be shuffled or seeded, so this improved version shuffles the unique group ids with a seeded KFold
kf = KFold(n_splits=4, shuffle=True, random_state=0)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[va_group_idx]
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
# 4.2 Cross-validation for time series
import numpy as np
import pandas as pd
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)  # drop the label column, as in section 3.7
train_x["period"] = np.arange(0, len(train_x)) // (len(train_x) // 4)
train_x["period"] = np.clip(train_x["period"], 0, 3)
test_x["period"] = 4
train_x
Direct split
# Split the data directly, without cross-validation: train on periods 0-2, validate on period 3
is_tr = train_x["period"] < 3
is_va = train_x["period"] == 3
tr_x, va_x = train_x[is_tr], train_x[is_va]
tr_y, va_y = train_y[is_tr], train_y[is_va]
Period-based cross-validation
# Cross-validation over all available data, by period
va_period_list = [1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] < va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
Time-ordered cross-validation
# Cross-validation over all available data, by time order
from sklearn.model_selection import TimeSeriesSplit
tss = TimeSeriesSplit(n_splits=4)
for tr_idx, va_idx in tss.split(train_x):
    print(tr_idx.min(), tr_idx.max(), va_idx.min(), va_idx.max())
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
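Newer scikit-learn releases also expose test_size and gap on TimeSeriesSplit, useful for fixed-size validation blocks and for leaving a buffer between train and validation; a sketch (values are illustrative and assume enough rows):
# tss = TimeSeriesSplit(n_splits=4, test_size=50, gap=10)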
Fixed-length sliding-window cross-validation
# Fixed-length sliding-window cross-validation
from sktime.forecasting.model_selection import SlidingWindowSplitter
cv = SlidingWindowSplitter(window_length=12, fh=list(range(0, 8)), step_length=3)
for train_idx, test_idx in cv.split(train_x):
    print(train_idx, test_idx)
    tr_x, va_x = train_x.iloc[train_idx], train_x.iloc[test_idx]
    tr_y, va_y = train_y.iloc[train_idx], train_y.iloc[test_idx]
Cross-validation with weak temporal ordering
# Cross-validation when the temporal ordering is weak: each period takes a turn as validation
va_period_list = [0, 1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] != va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
Threshold optimization for classification
Binary classification optimization
# 4.3 Threshold optimization practice
from scipy.optimize import minimize
from sklearn.metrics import f1_score
rand = np.random.RandomState(71)  # seeded for reproducibility (any fixed seed works)
train_y_prob = np.linspace(0, 1.0, 10000)  # true positive probability for each sample
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)  # sampled binary labels
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)  # simulated predictions: the true probability perturbed by lognormal noise
init_threshold = 0.5
init_score = f1_score(train_y, train_pred_prob >= init_threshold)
print(init_threshold, init_score)
def f1_opt(x):
    return -f1_score(train_y, train_pred_prob >= x)  # negated so that minimize() maximizes F1
result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
best_threshold = result["x"].item()
best_score = f1_score(train_y, train_pred_prob >= best_threshold)
print(best_threshold, best_score)
Out-Of-Fold
# Out-of-fold threshold optimization
from scipy.optimize import minimize
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
rand = np.random.RandomState(71)  # seeded for reproducibility
train_y_prob = np.linspace(0, 1.0, 10000)
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)
thresholds = []
scores_tr = []
scores_va = []
kf = KFold(n_splits=4, random_state=0, shuffle=True)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_pred_prob)):
    tr_pred_prob, va_pred_prob = train_pred_prob[tr_idx], train_pred_prob[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    def f1_opt(x):
        return -f1_score(tr_y, tr_pred_prob >= x)

    result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
    threshold = result["x"].item()
    score_tr = f1_score(tr_y, tr_pred_prob >= threshold)
    score_va = f1_score(va_y, va_pred_prob >= threshold)  # evaluate the in-fold threshold on held-out data
    print(threshold, score_tr, score_va)
    thresholds.append(threshold)
    scores_tr.append(score_tr)
    scores_va.append(score_va)
threshold_test = np.mean(thresholds)  # average the fold thresholds for use at test time
print(threshold_test)
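As a final check (a sketch), the averaged threshold can be scored against the full simulated data:
score_all = f1_score(train_y, train_pred_prob >= threshold_test)
print(threshold_test, score_all)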