sklearn数据预处理
# Toy dataset used by the scaler examples: 4 rows (samples) x 2 columns (features)
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
1. 归一化 sklearn.preprocessing.MinMaxScaler
from sklearn.preprocessing import MinMaxScaler
# Min-max normalization; the default feature_range is (0, 1).
# feature_range is documented as a tuple (min, max); recent scikit-learn
# releases validate the parameter type and reject a list such as [5, 10].
scaler = MinMaxScaler(feature_range=(5, 10))
# If the data has too many samples to fit in a single .fit() call
# (or arrives as a stream), feed it in batches with .partial_fit() instead.
scaler = scaler.fit(data)
result = scaler.transform(data)
# Fitting and transforming can also be done in one step:
# result = scaler.fit_transform(data)
result
# Undo the scaling to get the original data back:
# scaler.inverse_transform(result)
'''
array([[ 5. , 5. ],
[ 6.25, 6.25],
[ 7.5 , 7.5 ],
[10. , 10. ]])
'''
2. 标准化 sklearn.preprocessing.StandardScaler
from sklearn.preprocessing import StandardScaler
# Standardization: rescale each column to zero mean and unit variance
scaler = StandardScaler()
scaler.fit(data)
# Per-column statistics learned by fit()
print('两列均值:',scaler.mean_)
print('两列方差:',scaler.var_)
x_std = scaler.transform(data)
# ndarray.std() is the standard deviation, not the variance, so the label
# says 标准差 (both happen to equal 1 for standardized data)
print('均值:',x_std.mean(),'标准差:',x_std.std())
print(x_std)
'''
两列均值: [-0.125 9. ]
两列方差: [ 0.546875 35. ]
均值: 0.0 标准差: 1.0
[[-1.18321596 -1.18321596]
[-0.50709255 -0.50709255]
[ 0.16903085 0.16903085]
[ 1.52127766 1.52127766]]
'''
3. 缺失值 sklearn.impute.SimpleImputer
查看数据
import pandas as pd
# Load the sample CSV; index_col=0 turns the file's first column into the row index
data = pd.read_csv(
    './data_preprocessing.csv',
    index_col=0,
)
# Print a per-column summary (non-null counts and dtypes)
data.info()
'''
<class 'pandas.core.frame.DataFrame'>
Int64Index: 9 entries, 0 to 8
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Age 7 non-null float64
1 Sex 9 non-null object
2 Embarked 9 non-null object
3 Survived 9 non-null object
dtypes: float64(1), object(3)
memory usage: 360.0+ bytes
'''
填补缺失值
# Fill the missing values of the Age column
from sklearn.impute import SimpleImputer
# The imputer does not accept 1-D input, so turn the column into an (n, 1) array
Age = data.loc[:, 'Age'].values.reshape(-1, 1)
# strategy defaults to the mean; use the median and a constant 0 here instead
imp_median = SimpleImputer(strategy='median').fit_transform(Age)
imp_0 = SimpleImputer(strategy='constant', fill_value=0).fit_transform(Age)
print('中位数填补:', imp_median, '\n', '0填补:', imp_0)
'''
中位数填补: [[22.]
[38.]
[26.]
[26.]
[35.]
[26.]
[58.]
[20.]
[ 2.]]
0填补: [[22.]
[38.]
[26.]
[ 0.]
[35.]
[ 0.]
[58.]
[20.]
[ 2.]]
'''
# Mode imputation: replace missing entries with the most frequent value
imp_mode = SimpleImputer(strategy='most_frequent')
# Single-column 2-D view of Embarked, as the imputer expects
Embarked = data.loc[:, 'Embarked'].values.reshape(-1, 1)
data.loc[:, 'Embarked'] = imp_mode.fit_transform(Embarked)
pandas numpy简单的处理方法
# Handling missing values directly with pandas
# .fillna() returns a new Series with NaN replaced by the column mean;
# the result is not assigned here, so `data` itself is left unchanged
data.loc[:,'Age'].fillna(data.loc[:,'Age'].mean())
# axis=0 drops the rows that contain missing values (axis=1 would drop columns);
# inplace=False returns a new DataFrame instead of modifying `data`
data.dropna(axis=0, inplace=False)
| | Age | Sex | Embarked | Survived |
---|---|---|---|---|
0 | 22.0 | male | S | No |
1 | 38.0 | female | C | Yes |
2 | 26.0 | female | S | Yes |
4 | 35.0 | male | S | No |
6 | 58.0 | male | Q | No |
7 | 20.0 | female | C | Yes |
8 | 2.0 | female | S | Unknown |