python机器学习包sklearn数据预处理

最新推荐文章于 2024-06-09 19:37:22 发布

qq_27390023

最新推荐文章于 2024-06-09 19:37:22 发布

阅读量1k

点赞数 2

文章标签： python 机器学习

本文链接：https://blog.csdn.net/qq_27390023/article/details/125709631

版权

sklearn.preprocessing包提供了几个常见的实用函数和转换类，将原始特征向量改变为更适合下游机器学习的表示。

函数	功能
preprocessing.scale( )	标准化
preprocessing.MinMaxScaler( )	最大最小值标准化
preprocessing.StandardScaler( )	数据标准化
preprocessing.MaxAbsScaler( )	绝对值最大标准化
preprocessing.RobustScaler( )	带离群值数据集标准化
preprocessing.QuantileTransformer( )	使用分位数信息变换特征
preprocessing.PowerTransformer( )	使用幂变换执行到正态分布的映射
preprocessing.Normalizer( )	归一化（将每个样本缩放为单位范数）
preprocessing.OrdinalEncoder( )	将分类特征转换为分类数值
preprocessing.LabelEncoder( )	将目标标签编码为 0 到 n_classes-1 的整数
preprocessing.MultiLabelBinarizer( )	多标签二值化
preprocessing.OneHotEncoder( )	独热编码
preprocessing.KBinsDiscretizer( )	将连续数据离散化
preprocessing.FunctionTransformer( )	自定义特征处理函数
preprocessing.Binarizer( )	特征二值化
preprocessing.PolynomialFeatures( )	创建多项式特征
preprocessing.Normalizer( )	归一化（将每个样本缩放为单位范数）
impute.SimpleImputer( )	弥补缺失值（旧版 preprocessing.Imputer 已在 sklearn 0.22 中移除）

1.StandardScaler

# numpy: 1.21.1
# sklearn: 1.0.1

import numpy as np
from sklearn import preprocessing

## 1. StandardScaler
# Standardize each feature (column): z = (x - u) / s, so that every column
# ends up with mean 0 and unit variance. This only shifts and rescales the
# data; it does not by itself make the distribution Gaussian.
# Note: statistics are computed along axis=0 (per column); the scaler expects
# a 2-D (n_samples, n_features) array.

X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
scaler = preprocessing.StandardScaler()
scaler.fit(X_train)
X_scaled = scaler.transform(X_train)

# After scaling, the per-column mean should be ~0 and the per-column std 1.
column_means = X_scaled.mean(axis=0)
column_stds = X_scaled.std(axis=0)
print(column_means, column_stds)

# 数据标准化用于分类预测示例
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardization used as the first step of a classification pipeline.
X, y = make_classification(random_state=0)
print(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
pipe = make_pipeline(StandardScaler(), LogisticRegression())
pipe.fit(X_train, y_train)  # the scaler is fitted on the training data only
# Print the test accuracy explicitly: a bare `pipe.score(...)` expression is
# only echoed in a REPL/notebook and shows nothing when run as a script.
print(pipe.score(X_test, y_test))

2. MinMaxScaler

## 2. MinMaxScaler: rescale every feature (column) to the [0, 1] range
min_max_scaler = preprocessing.MinMaxScaler()
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
min_max_scaler.fit(X_train)
X_train_minmax = min_max_scaler.transform(X_train)
print(X_train_minmax)

# The test data is transformed with the min/max learned from X_train, so the
# result is not guaranteed to stay inside [0, 1].
X_test = np.array([[-3., -1., 4.]])
X_test_minmax = min_max_scaler.transform(X_test)
print(X_test_minmax)

3.MaxAbsScaler

## 3. MaxAbsScaler: divide every feature by its maximum absolute value,
# mapping the training data into the [-1, 1] range
max_abs_scaler = preprocessing.MaxAbsScaler()
X_train = np.array([
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
])
max_abs_scaler.fit(X_train)
X_train_maxabs = max_abs_scaler.transform(X_train)
print(X_train_maxabs)

# The test data reuses the scale learned from X_train, so the result is not
# guaranteed to stay inside [-1, 1].
X_test = np.array([[-3., -1., 4.]])
X_test_maxabs = max_abs_scaler.transform(X_test)

# Per-feature scaling factors learned during fit.
print(max_abs_scaler.scale_)

4.QuantileTransformer

## 4.QuantileTransformer,属于非线性变换
# map the data to a uniform distribution with values between 0 and 1:

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# n_quantiles defaults to 1000, which is more than the number of training
# samples; sklearn then emits a warning and clamps it to n_samples. Passing
# n_samples explicitly gives the same transform without the warning.
quantile_transformer = preprocessing.QuantileTransformer(
    n_quantiles=X_train.shape[0], random_state=0)
X_train_trans = quantile_transformer.fit_transform(X_train)
X_test_trans = quantile_transformer.transform(X_test)

# Percentiles of the raw second feature ...
print(np.percentile(X_train[:, 1], [0, 25, 50, 75, 100]))
# ... and of the same feature after the transform: the values are now spread
# approximately uniformly over [0, 1].
print(np.percentile(X_train_trans[:, 1], [0, 25, 50, 75, 100]))
print(X_train.shape)
# Use the ndarray method instead of the builtin min(), which iterates the
# array element by element in Python.
print(X_train[:, 1].min())

5.Normalization

## 5. Normalization
# Normalization scales each individual sample (row) to have unit norm.
X = [
    [1., -1., 2.],
    [2., 0., 0.],
    [0., 1., -1.],
]

# Functional API.
X_normalized = preprocessing.normalize(X, norm='l2')
print(X_normalized)

# Transformer API: fit() does nothing here; it only exists so the object can
# be used like any other transformer.
normalizer = preprocessing.Normalizer(norm='l2')
normalizer.fit(X)
X_normalized = normalizer.transform(X)
print(X_normalized)

6.序数编码

## 6.序数编码：分类特征 to 整数 (0 to n_categories - 1)
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
enc.transform([['female', 'from US', 'uses Safari']])
X_transform = enc.transform(X)
print(X_transform)

# sklearn.impute: 缺失值处理模块
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
# Encode the categories, then replace missing values with -1.
# The sample data contains np.nan so that the imputer step actually has
# something to fill; OrdinalEncoder passes np.nan through unchanged.
X_missing = [['male'], ['female'], [np.nan], ['female']]
enc = Pipeline(steps=[
    ("encoder", preprocessing.OrdinalEncoder()),
    ("imputer", SimpleImputer(strategy="constant", fill_value=-1)),
])
# Print the result so it is visible when run as a script.
print(enc.fit_transform(X_missing))

7.独热编码

## 7. One-hot encoding
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder()
enc.fit(X)
# Categories discovered for each feature during fit.
print(enc.categories_)

X_transform = enc.transform(X)
print("One hot")
print(X_transform)  # printed as returned by transform()
print(X_transform.toarray())  # dense view of the same data

# Pass the full list of categories of every feature explicitly instead of
# inferring them from the training data.
genders = ['female', 'male']
locations = ['from Africa', 'from Asia', 'from Europe', 'from US']
browsers = ['uses Chrome', 'uses Firefox', 'uses IE', 'uses Safari']
enc = preprocessing.OneHotEncoder(categories=[genders, locations, browsers])

X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
X_transform = enc.transform(X)
print(X_transform.toarray())

# 'from Asia' never appears in X but is listed in `locations`, so it can
# still be encoded.
print(enc.transform([['male', 'from Asia', 'uses Firefox']]).toarray())

# Note on the layout: the first two columns encode gender (01 or 10), the
# next four the location and the last four the browser.

# handle_unknown='ignore': a category that was not seen during fit is encoded
# as all zeros for that feature instead of raising an error.
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc = preprocessing.OneHotEncoder(handle_unknown='ignore')
enc.fit(X)
print("missing categorical features")
x = enc.transform([['female', 'from Asia', 'uses Chrome']]).toarray()
print(x)

# Every feature has exactly two values here, so the first category of each
# feature can be dropped and a single 0/1 column per feature is enough.
X = [
    ['male', 'from US', 'uses Safari'],
    ['female', 'from Europe', 'uses Firefox'],
]
drop_enc = preprocessing.OneHotEncoder(drop='first')
drop_enc.fit(X)
print(drop_enc.categories_)
print(drop_enc.transform(X).toarray())

# drop='if_binary': only features with exactly two categories lose a column;
# features with more than two categories keep all of their columns.
X = [
    ['male', 'US', 'Safari'],
    ['female', 'Europe', 'Firefox'],
    ['female', 'Asia', 'Chrome'],
]
drop_enc = preprocessing.OneHotEncoder(drop='if_binary')
drop_enc.fit(X)
print(drop_enc.categories_)

print(drop_enc.transform(X).toarray())

# Unknown categories at transform time ('IE' was not seen during fit):
# combined with handle_unknown='ignore' they are encoded as all zeros.
drop_enc = preprocessing.OneHotEncoder(drop='if_binary',
                                       handle_unknown='ignore')
drop_enc.fit(X)
X_test = [['male', 'Europe', 'IE']]
print("with unknown value")
print(drop_enc.transform(X_test).toarray())

# Missing values (None and np.nan) are treated as categories of their own.
X = [
    ['male', 'Safari'],
    ['female', None],
    [np.nan, 'Firefox'],
]
enc = preprocessing.OneHotEncoder(handle_unknown='error')
enc.fit(X)
print(enc.categories_)
print(enc.transform(X).toarray())

# Map one-hot codes back to the original feature values.
X = [['male', 'US', 'Safari'],
     ['female', 'Europe', 'Firefox'],
     ['female', 'Asia', 'Chrome']]
# The `sparse=False` argument was renamed to `sparse_output` in scikit-learn
# 1.2 and removed in 1.4, so passing it raises TypeError on current releases.
# Keeping the default output and densifying it with .toarray() works on both
# old and new versions.
drop_enc = preprocessing.OneHotEncoder(drop='if_binary',
                                       handle_unknown='ignore').fit(X)
# 'IE' was not seen during fit; with handle_unknown='ignore' it is encoded as
# all zeros.
X_test = [['male', 'US', 'IE']]
X_trans = drop_enc.transform(X_test).toarray()
print(X_trans)
print(drop_enc.inverse_transform(X_trans))

8.PolynomialFeatures

## 8.PolynomialFeatures
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
# Expand the two input features into all polynomial terms up to degree 2.
X = np.arange(6).reshape(3, 2)
poly = PolynomialFeatures(degree=2)
poly.fit(X)
print(poly.transform(X))

9.FunctionTransformer

## 9.FunctionTransformer：自定义转化
import numpy as np
from sklearn.preprocessing import FunctionTransformer
# Wrap an arbitrary function as a transformer.
# log1p(x) = log(x + 1); taking the logarithm can bring skewed data closer to
# a normal distribution.
transformer = FunctionTransformer(np.log1p, validate=True)
X = np.array([[0, 1], [2, 3]])
# Print the result: a bare `transformer.transform(X)` expression is only
# echoed in a REPL/notebook and shows nothing when run as a script.
print(transformer.transform(X))

参考：

https://zhuanlan.zhihu.com/p/393113910

https://scikit-learn.org/stable/modules/preprocessing.html#

qq_27390023

关注

2
点赞
踩
12

收藏

觉得还不错? 一键收藏
0
评论
python机器学习包sklearn数据预处理

sklearn.preprocessing包提供了几个常见的实用函数和转换类，将原始特征向量改变为更适合下游机器学习的表示。2. MinMaxScaler3.MaxAbsScaler4.QuantileTransformer5.Normalization6.序数编码7.热独编码8.PolynomialFeatures9.FunctionTransformer参考：https://zhuanlan.zhihu.com/p..
复制链接

扫一扫