变量转换器:YeoJohnsonTransformer
YeoJohnsonTransformer()将Yeo-Johnson变换应用于数值变量。
在这个演示中,我们使用Dean De Cock教授制作的Ames房价数据集:
Dean De Cock (2011) Ames, Iowa: Alternative to the Boston Housing
Data as an End of Semester Regression Project, Journal of Statistics Education, Vol.19, No. 3
http://jse.amstat.org/v19n3/decock.pdf
https://www.tandfonline.com/doi/abs/10.1080/10691898.2011.11889627
可以从Kaggle获取此笔记本中使用的数据集的版本。
# 导入所需的库
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# 导入特征工程库中的ArbitraryNumberImputer类和YeoJohnsonTransformer类
from feature_engine.imputation import ArbitraryNumberImputer
from feature_engine.transformation import YeoJohnsonTransformer
# Load the house-price data from the CSV file 'houseprice.csv'
data = pd.read_csv('houseprice.csv')
# Show the first few rows as a quick sanity check of the loaded data
data.head()
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | ... | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | ... | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
# Build the feature matrix by dropping the row identifier ('Id') and the
# target column ('SalePrice'); the target is kept as a separate series.
predictors = data.drop(['Id', 'SalePrice'], axis=1)
target = data['SalePrice']

# Hold out 30% of the rows as the test set. The fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    predictors,
    target,
    test_size=0.3,
    random_state=0,
)

# Report the shapes of the training and test feature sets.
print(X_train.shape, X_test.shape)
((1022, 79), (438, 79))
# Restrict the transformer to the two variables we want to transform.
selected_variables = ['LotArea', 'GrLivArea']
yjt = YeoJohnsonTransformer(variables=selected_variables)

# Fitting finds the best lambda of the transformation for each variable.
yjt.fit(X_train)
YeoJohnsonTransformer(variables=['LotArea', 'GrLivArea'])
# Inspect the dictionary of lambda values learned for the Yeo-Johnson
# transformation (one entry per transformed variable)
yjt.lambda_dict_
{'LotArea': 0.02258978732751055, 'GrLivArea': 0.06781061353154169}
# Apply the fitted transformer to the training set and to the test set,
# in that order, and bind the two transformed frames.
train_t, test_t = map(yjt.transform, (X_train, X_test))
# Histogram (50 bins) of GrLivArea before the transformation.
# Series.hist returns the Axes it drew on, so the title and x-label are set
# on that Axes directly.
axes = X_train['GrLivArea'].hist(bins=50)
axes.set_title('变量转换之前')
axes.set_xlabel('GrLivArea')
Text(0.5, 0, 'GrLivArea')
# Histogram (50 bins) of GrLivArea after the transformation.
# The Axes returned by Series.hist is used to label the plot.
axes = train_t['GrLivArea'].hist(bins=50)
# Chart title
axes.set_title('Transformed variable')
# x-axis label
axes.set_xlabel('GrLivArea')
Text(0.5, 0, 'GrLivArea')
# Plotting library (already imported earlier in the file; kept as in the
# original cell)
import matplotlib.pyplot as plt

# Histogram (50 bins) of LotArea from the untransformed training set.
axes = X_train['LotArea'].hist(bins=50)
# Chart title
axes.set_title('变换前的变量')
# x-axis label
axes.set_xlabel('LotArea')
Text(0.5, 0, 'LotArea')
# Histogram (50 bins) of LotArea taken from train_t, i.e. the training set
# AFTER the Yeo-Johnson transformation.
train_t['LotArea'].hist(bins=50)
# The data plotted here is the transformed variable, so the title says so
# (the previous title, 'Variable before transformation', was a copy-paste
# mistake from the preceding plot).
plt.title('Transformed variable')
# Label the x-axis with the variable name
plt.xlabel('LotArea')
Text(0.5, 0, 'LotArea')
自动选择数值变量
在使用YeoJohnsonTransformer之前,我们需要确保数值变量没有缺失数据。
# Imputer that replaces missing values with the arbitrary number 2.
arbitrary_imputer = ArbitraryNumberImputer(arbitrary_number=2)

# Fit the imputer on the training data only.
arbitrary_imputer.fit(X_train)

# Impute the training set and the test set (in that order) with the fitted
# imputer and bind the two resulting frames.
train_t, test_t = [
    arbitrary_imputer.transform(split) for split in (X_train, X_test)
]
# Initialize a Yeo-Johnson transformer without a variable list, so that it
# is applied to all variables it selects automatically
yjt = YeoJohnsonTransformer()
# Fit on the training data to learn the transformation parameters
yjt.fit(train_t)
C:\Users\Sole\Documents\Repositories\envs\fenotebook\lib\site-packages\numpy\core\_methods.py:205: RuntimeWarning: overflow encountered in multiply
x = um.multiply(x, x, out=x)
C:\Users\Sole\Documents\Repositories\envs\fenotebook\lib\site-packages\scipy\optimize\optimize.py:2149: RuntimeWarning: invalid value encountered in double_scalars
tmp2 = (x - v) * (fx - fw)
YeoJohnsonTransformer()
注意:上面输出的是运行时警告(RuntimeWarning),而不是错误,拟合仍然完成;出现这些警告是因为我们试图转换整数类型的变量。
# Inspect the variables that will be transformed
# (these are the numerical variables in the dataset)
yjt.variables_
['MSSubClass',
'LotFrontage',
'LotArea',
'OverallQual',
'OverallCond',
'YearBuilt',
'YearRemodAdd',
'MasVnrArea',
'BsmtFinSF1',
'BsmtFinSF2',
'BsmtUnfSF',
'TotalBsmtSF',
'1stFlrSF',
'2ndFlrSF',
'LowQualFinSF',
'GrLivArea',
'BsmtFullBath',
'BsmtHalfBath',
'FullBath',
'HalfBath',
'BedroomAbvGr',
'KitchenAbvGr',
'TotRmsAbvGrd',
'Fireplaces',
'GarageYrBlt',
'GarageCars',
'GarageArea',
'WoodDeckSF',
'OpenPorchSF',
'EnclosedPorch',
'3SsnPorch',
'ScreenPorch',
'PoolArea',
'MiscVal',
'MoSold',
'YrSold']
# Inspect the parameters learned by the YeoJohnsonTransformer:
# a dictionary mapping each variable to its lambda value
yjt.lambda_dict_
{'MSSubClass': -0.2378306859381657,
'LotFrontage': 0.8125118353085222,
'LotArea': 0.02258978732751055,
'OverallQual': 0.8643396718133388,
'OverallCond': 0.3969558337988488,
'YearBuilt': 22.190602427177428,
'YearRemodAdd': 39.78218820691653,
'MasVnrArea': -0.25239760257649463,
'BsmtFinSF1': 0.2366258546377551,
'BsmtFinSF2': -1.4342285901348042,
'BsmtUnfSF': 0.48006245124988,
'TotalBsmtSF': 0.7956604221557544,
'1stFlrSF': 0.02379301178012459,
'2ndFlrSF': -0.11762285496291223,
'LowQualFinSF': -9.792554984978894,
'GrLivArea': 0.06781061353154169,
'BsmtFullBath': -1.6227232190570056,
'BsmtHalfBath': -25.604493651416906,
'FullBath': 0.7799203671146958,
'HalfBath': -2.204536108690803,
'BedroomAbvGr': 0.8360057761170219,
'KitchenAbvGr': -1.594528611751712,
'TotRmsAbvGrd': 0.13886712654681466,
'Fireplaces': -0.3289777349413731,
'GarageYrBlt': 2.844211228808113,
'GarageCars': 1.3847317293866424,
'GarageArea': 0.8241205521050128,
'WoodDeckSF': -0.09933941199949706,
'OpenPorchSF': 0.00882850937230263,
'EnclosedPorch': -1.3484255768857674,
'3SsnPorch': -11.660518913094377,
'ScreenPorch': -2.30942436747328,
'PoolArea': -53.219699359215674,
'MiscVal': -3.612316196644568,
'MoSold': 0.723241223747494,
'YrSold': -2.72135950034568}
# Transform the training set and the test set with the fitted transformer.
# Both right-hand sides are evaluated (training set first) before either
# name is rebound.
train_t, test_t = yjt.transform(train_t), yjt.transform(test_t)