Advanced Data Mining

Data Loading and Deep Copy

import numpy as np
import pandas as pd
import warnings

warnings.filterwarnings("ignore")
train = pd.read_csv("./练习数据/ch03_practice_1.csv")
train
train_saved = train.copy()


def load_data():
    train = train_saved.copy()
    return train
train = load_data()

Feature Transformation

Standardization (zero mean, unit variance)

# 1.1 Feature transformation practice: standardization
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(train)
train_x = scaler.transform(train)
train_x
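As a quick sanity check (an addition to the original), each standardized column should come out with mean ≈ 0 and standard deviation ≈ 1:

print(train_x.mean(axis=0))  # approximately 0 for every column
print(train_x.std(axis=0))  # approximately 1 for every column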

Min-Max Normalization (0, 1)

# Feature transformation practice: min-max normalization
train = load_data()
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler.fit(train)
train_x = scaler.transform(train)
train_x
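The same result can be computed by hand, which makes the min-max formula explicit (a minimal sketch; train_manual is a name introduced here):

# (x - min) / (max - min), column by column; matches scaler.transform(train) up to floating-point error
train_manual = (train - train.min()) / (train.max() - train.min())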

Nonlinear Transformations

Log Transformation

# 1.2 Feature transformation practice: nonlinear transformations
x = np.array([1.0, 10.0, 100.0, 1000.0, 10000.0])
# plain log
x1 = np.log(x)
# log(1 + x)
x2 = np.log1p(x)
# log of the absolute value, with the original sign restored (handles negatives)
x3 = np.sign(x) * np.log(np.abs(x))
import matplotlib.pyplot as plt
import seaborn as sns
# plot the original values and the three log variants
# (sns.distplot is deprecated in recent seaborn; histplot is its replacement)
plt.subplot(1, 4, 1)
sns.histplot(x, kde=True)
plt.subplot(1, 4, 2)
sns.histplot(x1, kde=True)
plt.subplot(1, 4, 3)
sns.histplot(x2, kde=True)
plt.subplot(1, 4, 4)
sns.histplot(x3, kde=True)
plt.show()
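The point of log1p is its behavior at zero: log(0) is -inf, while log1p(0) is exactly 0. A quick illustration using numpy as imported above:

print(np.log1p(0.0))  # 0.0
with np.errstate(divide="ignore"):
    print(np.log(0.0))  # -inf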

Box-Cox Transformation

# Box-Cox transformation (requires strictly positive inputs)
train = load_data()
# keep only the columns whose values are all positive
cols = [c for c in train.columns.tolist() if (train[c] > 0.0).all()]
from sklearn.preprocessing import PowerTransformer

pt = PowerTransformer(method="box-cox")
pt.fit(train[cols])
train_x = pt.transform(train[cols])
# compare the first positive column before and after the transform
plt.subplot(1, 2, 1)
sns.histplot(train[cols[0]], kde=True)
plt.subplot(1, 2, 2)
sns.histplot(train_x[:, 0], kde=True)
plt.show()
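After fitting, PowerTransformer exposes the lambda it selected for each column via the lambdas_ attribute, which is worth a look when interpreting the transform:

print(pt.lambdas_)  # one fitted Box-Cox lambda per column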

Yeo-Johnson Transformation

# Yeo-Johnson transformation (also handles zero and negative values, so the positive-column filter is not strictly required here)
train = load_data()
pt = PowerTransformer(method="yeo-johnson")
pt.fit(train[cols])
train_x = pt.transform(train[cols])
plt.subplot(1, 2, 1)
sns.histplot(train[cols[0]], kde=True)
plt.subplot(1, 2, 2)
sns.histplot(train_x[:, 0], kde=True)
plt.show()

Feature Encoding

FeatureHasher

# 1.3 Feature encoding: FeatureHasher practice
train = load_data()
from sklearn.feature_extraction import FeatureHasher

col = ["product", "year", "month", "day"]
for c in col:
    # hash each categorical column into n_features numeric columns
    fh = FeatureHasher(n_features=4, input_type="string")
    hash_train = fh.transform(train[[c]].astype(str).values)
    hash_train = pd.DataFrame(
        hash_train.todense(), columns=[f"{c}_{i}" for i in range(4)]
    )
    train = pd.concat([train, hash_train], axis=1)
train

frequency encoding

# Feature encoding: frequency encoding practice
train = load_data()
for c in col:
    freq = train[c].value_counts()  # occurrence count of each category
    train[c] = train[c].map(freq)  # replace each category with its frequency
train

target encoding

# Feature encoding: target encoding practice
train = pd.read_csv("./练习数据/ch03_practice_2.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

# target encoding: replace each category with the target mean computed on the
# other folds; doing it out-of-fold prevents leaking a row's own target value
for c in col:
    data_tmp = pd.DataFrame({c: train_x[c], "target": train_y})
    tmp = np.repeat(np.nan, train_x.shape[0])  # np.repeat builds an ndarray of the given size, filled with NaN
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for idx_1, idx_2 in kf.split(train_x):
        # category means computed on the training folds only
        target_mean = data_tmp.iloc[idx_1].groupby(c)["target"].mean()
        tmp[idx_2] = train_x[c].iloc[idx_2].map(target_mean)
    train_x[c] = tmp
train_x

Wide and Long Tables

# 1.4 Wide and long tables
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time
df_wide = df_time
df_long = df_wide.stack().reset_index(1)  # stack() moves the column labels into the innermost level of the row index
df_long.columns = ["id", "value"]
df_long
df_wide = df_long.pivot(
    index=None, columns="id", values="value"
)  # pivot reshapes back to wide format: one column per id, rows keyed by the existing index
df_wide
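For reference, melt offers an equivalent wide-to-long route (a sketch assuming pandas 1.1+ for the ignore_index argument; df_long_alt is a name introduced here):

df_long_alt = df_wide.melt(var_name="id", value_name="value", ignore_index=False)  # keeps the datetime index, like df_long above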

Feature Construction

Features from Missing Values

# 2.2 Features from each row's count of missing values, plus a flag for whether a specific column is missing
train["nan_count"] = train.isnull().sum(axis=1)
train["year_nan"] = train.year.isnull().astype(int)
train

Sliding-Window Features

# 2.4 Sliding-window feature construction
df_time = pd.read_csv("./练习数据/ch03_practice_3.csv", index_col=0)
df_time.index = pd.to_datetime(df_time.index)
df_time_A = df_time[["A"]]
df_time_A_saved = df_time_A.copy()


def load_data():
    df_time_A = df_time_A_saved.copy()
    return df_time_A
x = load_data()
x_lag1 = x.shift(1)  # value 1 row back (lag 1)
x_lag7 = x.shift(7)  # value 7 rows back (lag 7)
x["lag1"] = x_lag1
x["lag7"] = x_lag7
x
x = load_data()
x_avg3 = x.shift(1).rolling(window=3).mean()  # rolling() aggregates over a window of the given size; shift(1) keeps the window strictly in the past
x_max7 = x.shift(1).rolling(window=7).max()
x_e7_avg = (x.shift(7) + x.shift(14) + x.shift(21)) / 3.0  # average of the values 7, 14, and 21 rows back
x["avg3"] = x_avg3
x["max7"] = x_max7
x["e7_avg"] = x_e7_avg
x

Unsupervised Feature Construction

# 2.5 Unsupervised feature construction
# pip install umap-learn
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(df_time)
df = scaler.transform(df_time)

The UMAP Algorithm

UMAP takes a dataset with more than two features and produces a low-dimensional embedding for exploring it. Similar samples tend to land close together in the resulting two-dimensional scatter plot: the goal of the algorithm is to preserve, in low dimensions, the cluster structure and the relationships between samples that exist in the high-dimensional space.

import umap

um = umap.UMAP()
um.fit(df)
df_um = pd.DataFrame(um.transform(df), columns=["um_1", "um_2"], index=df_time.index)
df_um
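Plotting the two embedding columns as a scatter plot is the usual way to inspect the result (a small sketch; matplotlib was imported earlier):

plt.scatter(df_um["um_1"], df_um["um_2"], s=10)
plt.xlabel("um_1")
plt.ylabel("um_2")
plt.show()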

K-Means Clustering

from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=3)
kmeans.fit(df_time)
df_clusters = kmeans.predict(df_time)
df_distances = kmeans.transform(df_time)  # transform() gives each sample's distance to every cluster center
df_distances = pd.DataFrame(
    df_distances,
    columns=["distance_1", "distance_2", "distance_3"],
    index=df_time.index,
)
df = pd.concat([df_time, df_distances], axis=1)
df
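The hard cluster assignments in df_clusters, computed above but not yet used, also make a useful categorical feature (a small addition):

df["cluster"] = df_clusters  # cluster label per sample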

Feature Importance

# 2.7 Feature importance output
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]

RandomForestClassifier

from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10)
clf.fit(train_x, train_y)
fi = clf.feature_importances_  # ndarray with one importance value per feature
idx = np.argsort(fi)[::-1]  # feature indices sorted by importance, descending
top_features, top_importances = train_x.columns.values[idx][:5], fi[idx][:5]
print("random forest importance")
print(top_features, top_importances)
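A horizontal bar chart makes the ranking easier to read (a minimal sketch; matplotlib was imported earlier):

plt.barh(top_features[::-1], top_importances[::-1])  # reversed so the most important feature is on top
plt.xlabel("importance")
plt.show()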

xgboost

import xgboost as xgb

dtrain = xgb.DMatrix(train_x, label=train_y)
params = {"objective": "binary:logistic", "silent": 1}
num_round = 20
model = xgb.train(params, dtrain, num_round)
fscore = model.get_score(importance_type="total_gain")
fscore = sorted(
    [(k, v) for k, v in fscore.items()], key=lambda tpl: tpl[1], reverse=True
)
print("xgboost importance")
print(fscore[:5])
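get_score supports several importance types besides "total_gain", including "weight", "gain", "cover", and "total_cover"; for example:

print(model.get_score(importance_type="gain"))  # average gain per split, per feature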

Greedy Forward Search

# 2.7 Feature selection by greedy forward search
from sklearn.model_selection import train_test_split

train_x, test_x, train_y, test_y = train_test_split(train_x, train_y)
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
# build the training and validation sets (first fold only)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss


def evaluate(features):
    dtrain = xgb.DMatrix(tr_x[features], label=tr_y)
    dvalid = xgb.DMatrix(va_x[features], label=va_y)
    params = {"objective": "binary:logistic", "silent": 1}
    num_round = 10
    early_stopping_rounds = 3
    watchlist = [(dtrain, "train"), (dvalid, "eval")]
    model = xgb.train(
        params,
        dtrain,
        num_round,
        evals=watchlist,
        early_stopping_rounds=early_stopping_rounds,
        verbose_eval=0,
    )
    va_pred = model.predict(dvalid)
    score = log_loss(va_y, va_pred)  # lower log_loss means a better model
    return score
best_score = 9999.0
candidates = np.random.RandomState(0).permutation(train_x.columns)  # random permutation of the column names
selected = set([])
for feature in candidates:
    fs = list(selected) + [feature]
    score = evaluate(fs)
    if score < best_score:
        selected.add(feature)
        best_score = score
        print(f"selected:{feature}")
        print(f"score:{score}")

print(f"selected features: {selected}")

Model Tuning (XGBoost)

Bayesian Optimization

# 3.4 XGBoost tuning
# Bayesian optimization practice
class Model:
    def __init__(self, params=None):
        self.model = None
        if params is None:
            self.params = {}
        else:
            self.params = params

    def fit(self, tr_x, tr_y, va_x, va_y):
        params = {"objective": "binary:logistic", "silent": 1}
        params.update(self.params)
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
from sklearn.metrics import log_loss
def score(params):
    params["max_depth"] = int(params["max_depth"])
    model = Model(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}
space = {
    "min_child_weight": hp.quniform("min_child_weight", 1, 5, 1),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "gamma": hp.quniform("gamma", 0, 0.4, 0.1),
}
max_evals = 10
trials = Trials()
history = []
fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")

XGBoost Parameter Space

# 3.4.2 XGBoost parameter space
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eta": 0.1,
    "gamma": 0.0,
    "alpha": 0.0,
    "lambda": 1.0,
    "min_child_weight": 1,
    "max_depth": 5,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "random_state": 0,
}

param_space = {
    "min_child_weight": hp.loguniform("min_child_weight", np.log(0.1), np.log(10)),
    "max_depth": hp.quniform("max_depth", 3, 9, 1),
    "subsample": hp.quniform("subsample", 0.6, 0.95, 0.05),
    "colsample_bytree": hp.quniform("colsample_bytree", 0.6, 0.95, 0.05),
    "gamma": hp.loguniform("gamma", np.log(1e-8), np.log(1.0))
    # 资源充裕时再做如下寻优
    # 'alpha' : hp.loguniform('alpha', np.log(1e-8), np.log(1.0)),
    # 'lambda' : hp.loguniform('lambda', np.log(1e-6), np.log(10.0))
}

Neural Network Tuning

# 3.5 Neural network tuning
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
from hyperopt import hp
# use the tensorflow.keras paths (keras.layers.core and keras.layers.advanced_activations were removed in modern Keras)
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization, Dense, Dropout, PReLU, ReLU
from tensorflow.keras.models import Sequential
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import SGD, Adam
base_param = {
    "input_dropout": 0.0,
    "hidden_layers": 3,
    "hidden_units": 96,
    "hidden_activation": "relu",
    "hidden_dropout": 0.2,
    "batch_norm": "before_act",
    "optimizer": {"type": "adam", "lr": 0.001},
    "batch_size": 64,
}
param_space = {
    "input_dropout": hp.quniform("input_dropout", 0, 0.2, 0.05),
    "hidden_layers": hp.quniform("hidden_layers", 2, 4, 1),
    "hidden_units": hp.quniform("hidden_units", 32, 256, 32),
    "hidden_activation": hp.choice("hidden_activation", ["prelu", "relu"]),
    "hidden_dropout": hp.quniform("hidden_dropout", 0, 0.3, 0.05),
    "batch_norm": hp.choice("batch_norm", ["before_act", "no"]),
    "optimizer": hp.choice(
        "optimizer",
        [
            {
                "type": "adam",
                "lr": hp.loguniform("adam_lr", np.log(0.00001), np.log(0.01)),
            },
            {
                "type": "sgd",
                "lr": hp.loguniform("sgd_lr", np.log(0.00001), np.log(0.01)),
            },
        ],
    ),
    "batch_size": hp.quniform("batch_size", 32, 128, 32),
}
class MLP:
    def __init__(self, params):
        self.params = params
        self.scaler = None
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        input_dropout = self.params["input_dropout"]
        hidden_layers = int(self.params["hidden_layers"])
        hidden_units = int(self.params["hidden_units"])
        hidden_activation = self.params["hidden_activation"]
        hidden_dropout = self.params["hidden_dropout"]
        batch_norm = self.params["batch_norm"]
        optimizer_type = self.params["optimizer"]["type"]
        optimizer_lr = self.params["optimizer"]["lr"]
        batch_size = int(self.params["batch_size"])
        # standardize the inputs
        self.scaler = StandardScaler()
        tr_x = self.scaler.fit_transform(tr_x)
        va_x = self.scaler.transform(va_x)
        self.model = Sequential()
        # input layer
        self.model.add(Dropout(input_dropout, input_shape=(tr_x.shape[1],)))
        # hidden layers
        for i in range(hidden_layers):
            self.model.add(Dense(hidden_units))
            if batch_norm == "before_act":
                self.model.add(BatchNormalization())
            if hidden_activation == "prelu":
                self.model.add(PReLU())
            elif hidden_activation == "relu":
                self.model.add(ReLU())
            else:
                raise NotImplementedError
            self.model.add(Dropout(hidden_dropout))
        # output layer
        self.model.add(Dense(1, activation="sigmoid"))
        # optimizer ("lr" and "decay" are legacy argument names; learning_rate is current)
        if optimizer_type == "sgd":
            optimizer = SGD(learning_rate=optimizer_lr, momentum=0.9, nesterov=True)
        elif optimizer_type == "adam":
            optimizer = Adam(learning_rate=optimizer_lr, beta_1=0.9, beta_2=0.999)
        else:
            raise NotImplementedError
        # loss and evaluation metric
        self.model.compile(
            loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]
        )
        nb_epoch = 200
        patience = 20
        early_stopping = EarlyStopping(patience=patience, restore_best_weights=True)
        history = self.model.fit(
            tr_x,
            tr_y,
            epochs=nb_epoch,
            batch_size=batch_size,
            verbose=1,
            validation_data=(va_x, va_y),
            callbacks=[early_stopping],
        )

    def predict(self, x):
        x = self.scaler.transform(x)
        y_pred = self.model.predict(x)
        y_pred = y_pred.flatten()
        return y_pred
def score(params):
    model = MLP(params)
    model.fit(tr_x, tr_y, va_x, va_y)
    va_pred = model.predict(va_x)
    score = log_loss(va_y, va_pred)
    print(f"params: {params}, logloss: {score:.4f}")
    history.append((params, score))
    return {"loss": score, "status": STATUS_OK}
max_evals = 10
trials = Trials()
history = []
fmin(score, param_space, algo=tpe.suggest, trials=trials, max_evals=max_evals)
history = sorted(history, key=lambda tpl: tpl[1])
best = history[0]
print(f"best params:{best[0]}, score:{best[1]:.4f}")

Linear Models

# 3.6 Linear model practice
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

LogisticRegression

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
tr_x = scaler.fit_transform(tr_x)
va_x = scaler.transform(va_x)
model = LogisticRegression()
model.fit(tr_x, tr_y)
va_pred = model.predict_proba(va_x)  # class probabilities; log_loss accepts the full two-column matrix
score = log_loss(va_y, va_pred)
print(f"logloss: {score:.4f}")

Custom Evaluation Functions

# 3.7 Custom objective and evaluation functions
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
import xgboost as xgb
from sklearn.metrics import log_loss

dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# custom objective and evaluation metric
def logregobj(preds, dtrain):
    # logloss on the raw margin z: with p = sigmoid(z),
    # the gradient is p - y and the hessian is p * (1 - p)
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))
    grad = preds - labels
    hess = preds * (1.0 - preds)
    return grad, hess


def evalerror(preds, dtrain):
    # with a custom objective, preds are raw margins, so the decision boundary is 0.0 rather than 0.5
    labels = dtrain.get_label()
    return "custom-error", float(sum(labels != (preds > 0.0))) / len(labels)
params = {"silent": 1, "random_state": 0}
num_round = 50
watchlist = [(dtrain, "train"), (dvalid, "eval")]

bst = xgb.train(params, dtrain, num_round, watchlist, obj=logregobj, feval=evalerror)
pred_val = bst.predict(dvalid)
pred = 1.0 / (1.0 + np.exp(-pred_val))  # the custom objective yields raw margins, so apply the sigmoid to get probabilities
logloss = log_loss(va_y, pred)
print(logloss)
params = {"silent": 1, "random_state": 0, "objective": "binary:logistic"}
bst = xgb.train(params, dtrain, num_round, watchlist)
pred = bst.predict(dvalid)
logloss = log_loss(va_y, pred)
print(logloss)
# an MAE-like objective for xgboost (Fair loss)
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
from sklearn.model_selection import KFold

kf = KFold(n_splits=4, shuffle=True)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
dtrain = xgb.DMatrix(tr_x, label=tr_y)
dvalid = xgb.DMatrix(va_x, label=va_y)
# Fair loss: a smooth, MAE-like objective with bounded gradients
def fair(preds, dtrain):
    x = preds - dtrain.get_label()  # residual
    c = 1.0  # constant controlling where the loss transitions from quadratic to linear
    den = abs(x) + c
    grad = c * x / den
    hess = c * c / den ** 2
    return grad, hess
import numpy as np
import xgboost as xgb

param = {"max_depth": 3, "eta": 1, "silent": 1}
watchlist = [(dvalid, "eval"), (dtrain, "train")]
num_round = 15
bst = xgb.train(param, dtrain, num_round, watchlist, obj=fair)

Model Ensembling

Stacking

# 3.8.1 Stacking practice: build the base models
import numpy as np
import pandas as pd
import xgboost as xgb
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import os

os.environ["TF_CPP_MIN_LOG_LEVEL"] = "1"
import tensorflow as tf

tf.compat.v1.logging.set_verbosity(tf.compat.v1.logging.ERROR)
class Model1Xgb:
    def __init__(self):
        self.model = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        params = {
            "objective": "binary:logistic",
            "verbosity": 0,
            "random_state": 0,
            "eval_metric": "logloss",
        }
        num_round = 10
        dtrain = xgb.DMatrix(tr_x, label=tr_y)
        dvalid = xgb.DMatrix(va_x, label=va_y)
        watchlist = [(dtrain, "train"), (dvalid, "eval")]
        self.model = xgb.train(params, dtrain, num_round, evals=watchlist)

    def predict(self, x):
        data = xgb.DMatrix(x)
        pred = self.model.predict(data)
        return pred
class Model1NN:
    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        batch_size = 128
        epochs = 10
        tr_x = self.scaler.transform(tr_x)
        va_x = self.scaler.transform(va_x)
        model = Sequential()
        model.add(Dense(256, activation="relu", input_shape=(tr_x.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(256, activation="relu"))
        model.add(Dropout(0.2))
        model.add(Dense(1, activation="sigmoid"))
        model.compile(loss="binary_crossentropy", optimizer="adam")
        history = model.fit(
            tr_x,
            tr_y,
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(va_x, va_y),
        )
        self.model = model

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict(x).reshape(-1)
        return pred
class Model2Linear:
    def __init__(self):
        self.model = None
        self.scaler = None

    def fit(self, tr_x, tr_y, va_x, va_y):
        self.scaler = StandardScaler()
        self.scaler.fit(tr_x)
        tr_x = self.scaler.transform(tr_x)
        self.model = LogisticRegression(solver="lbfgs")
        self.model.fit(tr_x, tr_y)

    def predict(self, x):
        x = self.scaler.transform(x)
        pred = self.model.predict_proba(x)[:, 1]
        return pred
def predict_cv(model, train_x, train_y, test_x):
    # out-of-fold predictions for the training data, plus fold-averaged test predictions
    preds = []
    preds_test = []
    va_idxes = []
    kf = KFold(n_splits=4, shuffle=True, random_state=0)
    for i, (tr_idx, va_idx) in enumerate(kf.split(train_x)):
        tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
        tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
        model.fit(tr_x, tr_y, va_x, va_y)
        pred = model.predict(va_x)
        preds.append(pred)
        pred_test = model.predict(test_x)
        preds_test.append(pred_test)
        va_idxes.append(va_idx)
    # restore the original row order of the out-of-fold predictions
    va_idxes = np.concatenate(va_idxes)
    preds = np.concatenate(preds, axis=0)
    order = np.argsort(va_idxes)
    pred_train = preds[order]
    # average the per-fold test predictions
    preds_test = np.mean(preds_test, axis=0)
    return pred_train, preds_test
train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
model_1a = Model1Xgb()
pred_train_1a, pred_test_1a = predict_cv(model_1a, train_x, train_y, test_x)
model_1b = Model1NN()
pred_train_1b, pred_test_1b = predict_cv(model_1b, train_x, train_y, test_x)
print(f"logloss: {log_loss(train_y, pred_train_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(train_y, pred_train_1b, eps=1e-7):.4f}")
train_x_2 = pd.DataFrame({"pred_1a": pred_train_1a, "pred_1b": pred_train_1b})
test_x_2 = pd.DataFrame({"pred_1a": pred_test_1a, "pred_1b": pred_test_1b})
model_2 = Model2Linear()
pred_train_2, pred_test_2 = predict_cv(model_2, train_x_2, train_y, test_x_2)
print(f"logloss: {log_loss(train_y, pred_train_2, eps=1e-7):.4f}")

Hold-out

# 3.8.2 Hold-out stacking practice
kf = KFold(n_splits=4, shuffle=True, random_state=0)
tr_idx, va_idx = list(kf.split(train_x))[0]
tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
model_1a = Model1Xgb()
model_1a.fit(tr_x, tr_y, va_x, va_y)
va_pred_1a = model_1a.predict(va_x)
test_pred_1a = model_1a.predict(test_x)
model_1b = Model1NN()
model_1b.fit(tr_x, tr_y, va_x, va_y)
va_pred_1b = model_1b.predict(va_x)
test_pred_1b = model_1b.predict(test_x)
print(f"logloss: {log_loss(va_y, va_pred_1a, eps=1e-7):.4f}")
print(f"logloss: {log_loss(va_y, va_pred_1b, eps=1e-7):.4f}")
va_x_2 = pd.DataFrame({"pred_1a": va_pred_1a, "pred_1b": va_pred_1b})
test_x_2 = pd.DataFrame({"pred_1a": test_pred_1a, "pred_1b": test_pred_1b})
model2 = Model2Linear()
model2.fit(va_x_2, va_y, None, None)
pred_test_2 = model2.predict(test_x_2)

Cross-Validation

StratifiedKFold

# 4.1 Advanced cross-validation: StratifiedKFold
import numpy as np
import pandas as pd

train = pd.read_csv("./练习数据/ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("./练习数据/ch03_practice_4_test.csv")
test_x = test_x.drop(["target"], axis=1)
from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=4, shuffle=True, random_state=71)
for tr_idx, va_idx in kf.split(train_x, train_y):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

GroupKFold

# Advanced cross-validation: GroupKFold, using 'user_id' as an example
train_x["user_id"] = np.arange(0, len(train_x)) // 4
train_x
# GroupKFold practice
from sklearn.model_selection import GroupKFold, KFold

user_id = train_x["user_id"]
unique_user_ids = user_id.unique()
kf = GroupKFold(n_splits=4)
for tr_idx, va_idx in kf.split(train_x, train_y, user_id):
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]
# GroupKFold does not support shuffle or a random seed, so an improved version follows
kf = KFold(n_splits=4, shuffle=True, random_state=0)
for tr_group_idx, va_group_idx in kf.split(unique_user_ids):
    tr_groups, va_groups = unique_user_ids[tr_group_idx], unique_user_ids[va_group_idx]
    is_tr = user_id.isin(tr_groups)
    is_va = user_id.isin(va_groups)
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]
# 4.2 Time-series cross-validation
import numpy as np
import pandas as pd

train = pd.read_csv("D:\\ch03_practice_4.csv")
train_x = train.drop(["target"], axis=1)
train_y = train["target"]
test_x = pd.read_csv("D:\\ch03_practice_4_test.csv")
train_x["period"] = np.arange(0, len(train_x)) // (len(train_x) // 4)
train_x["period"] = np.clip(train_x["period"], 0, 3)
test_x["period"] = 4
train_x

Direct Split

# split the dataset directly, without cross-validation
is_tr = train_x["period"] < 3
is_va = train_x["period"] == 3
tr_x, va_x = train_x[is_tr], train_x[is_va]
tr_y, va_y = train_y[is_tr], train_y[is_va]

Period-Based Cross-Validation

# cross-validation over all available data, split by period
va_period_list = [1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] < va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]

Time-Ordered Cross-Validation

# cross-validation over all available data, in time order
from sklearn.model_selection import TimeSeriesSplit

tss = TimeSeriesSplit(n_splits=4)
for tr_idx, va_idx in tss.split(train_x):
    print(tr_idx.min(), tr_idx.max(), va_idx.min(), va_idx.max())
    tr_x, va_x = train_x.iloc[tr_idx], train_x.iloc[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

Fixed-Length Sliding-Window Cross-Validation

# fixed-length sliding-window cross-validation
from sktime.forecasting.model_selection import SlidingWindowSplitter

cv = SlidingWindowSplitter(window_length=12, fh=list(range(0, 8)), step_length=3)
for train_idx, test_idx in cv.split(train_x):
    print(train_idx, test_idx)
    tr_x, va_x = train_x.iloc[train_idx], train_x.iloc[test_idx]
    tr_y, va_y = train_y.iloc[train_idx], train_y.iloc[test_idx]

Cross-Validation with Weak Temporal Order

# cross-validation when the temporal order is weak
va_period_list = [0, 1, 2, 3]
for va_period in va_period_list:
    is_tr = train_x["period"] != va_period
    is_va = train_x["period"] == va_period
    tr_x, va_x = train_x[is_tr], train_x[is_va]
    tr_y, va_y = train_y[is_tr], train_y[is_va]

Threshold Optimization for Classification

Binary Classification Optimization

# 4.3 Threshold optimization practice
from scipy.optimize import minimize
from sklearn.metrics import f1_score

rand = np.random.RandomState()
train_y_prob = np.linspace(0, 1.0, 10000)
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)
init_threshold = 0.5
init_score = f1_score(train_y, train_pred_prob >= init_threshold)
print(init_threshold, init_score)
def f1_opt(x):
    return -f1_score(train_y, train_pred_prob >= x)
result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
best_threshold = result["x"].item()
best_score = f1_score(train_y, train_pred_prob >= best_threshold)
print(best_threshold, best_score)
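Because F1 as a function of the threshold is a step function, a plain grid search is a robust cross-check of the Nelder-Mead result (a sketch with a hypothetical step of 0.01):

thresholds_grid = np.arange(0.01, 1.0, 0.01)
scores_grid = [f1_score(train_y, train_pred_prob >= t) for t in thresholds_grid]
print(thresholds_grid[int(np.argmax(scores_grid))], max(scores_grid))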

Out-Of-Fold

# out-of-fold threshold optimization
from scipy.optimize import minimize
from sklearn.metrics import f1_score
from sklearn.model_selection import KFold

rand = np.random.RandomState()
train_y_prob = np.linspace(0, 1.0, 10000)
train_y = pd.Series(rand.uniform(0.0, 1.0, train_y_prob.size) < train_y_prob)
train_pred_prob = np.clip(
    train_y_prob * np.exp(rand.standard_normal(train_y_prob.shape) * 0.3), 0.0, 1.0
)
thresholds = []
scores_tr = []
scores_va = []
kf = KFold(n_splits=4, random_state=0, shuffle=True)
for i, (tr_idx, va_idx) in enumerate(kf.split(train_pred_prob)):
    tr_pred_prob, va_pred_prob = train_pred_prob[tr_idx], train_pred_prob[va_idx]
    tr_y, va_y = train_y.iloc[tr_idx], train_y.iloc[va_idx]

    def f1_opt(x):
        return -f1_score(tr_y, tr_pred_prob >= x)

    result = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")
    threshold = result["x"].item()
    score_tr = f1_score(tr_y, tr_pred_prob >= threshold)
    score_va = f1_score(va_y, va_pred_prob >= threshold)
    print(threshold, score_tr, score_va)
    thresholds.append(threshold)
    scores_tr.append(score_tr)
    scores_va.append(score_va)
threshold_test = np.mean(thresholds)
print(threshold_test)