import pandas as pd
import numpy as np
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
# Load the dataset from disk.
data = pd.read_csv('train_data.csv')

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
train, test = train_test_split(data, test_size=0.2, random_state=42)

# Name of the target column and of the categorical feature columns.
target_col = 'target'
cat_cols = ['cat_var1', 'cat_var2', 'cat_var3']

# Fit the target encoder on the training split only, then apply the fitted
# encoder to both splits (train first, then test).
encoder = TargetEncoder(cols=cat_cols)
encoder.fit(train[cat_cols], train[target_col])
train_encoded, test_encoded = (
    encoder.transform(split[cat_cols]) for split in (train, test)
)

# Replace the raw categorical columns with their encoded counterparts; every
# other column of each split is carried over unchanged.
train_merged = pd.concat([train.drop(columns=cat_cols), train_encoded], axis=1)
test_merged = pd.concat([test.drop(columns=cat_cols), test_encoded], axis=1)
# Train a logistic regression model and evaluate its performance
lr = L
# Python code for handling categorical variables with Target Encoding: no more headaches over categorical features
# First published on 2023-06-20 11:49:26