# Load the two datasets used throughout this preprocessing walkthrough.
import pandas as pd
from sklearn import datasets

iris = datasets.load_iris()
data = iris.data                    # feature matrix (numpy array)
target = iris.target                # integer class labels
feature_name = iris.feature_names   # human-readable feature names
target_names = iris.target_names    # class label names

# Wrap the iris features in a DataFrame for readable display.
data1 = pd.DataFrame(data, columns=feature_name)
print(data1.head(2))

# Titanic-style narrative data: first CSV column becomes the index,
# then rows with missing values are dropped.
data2 = pd.read_csv('Narrativedata.csv', index_col=[0])
data2.dropna(inplace=True)
print(data2.head(2))
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
Age Sex Embarked Survived
0 22.0 male S No
1 38.0 female C Yes
Normalization (min-max scaling)
# Min-max normalization: linearly rescale every feature onto [0, 1].
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaler = scaler.fit(data1)        # fit() records each column's min and max
result = scaler.transform(data1)  # apply the learned scaling
result[:5]
array([[0.22222222, 0.625 , 0.06779661, 0.04166667],
[0.16666667, 0.41666667, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.05084746, 0.04166667],
[0.08333333, 0.45833333, 0.08474576, 0.04166667],
[0.19444444, 0.66666667, 0.06779661, 0.04166667]])
# fit_transform() combines fit() and transform() in one call; here it is
# applied to the raw numpy array instead of the DataFrame — same values.
result_ = scaler. fit_transform( data)
result_[ : 5 ]
array([[0.22222222, 0.625 , 0.06779661, 0.04166667],
[0.16666667, 0.41666667, 0.06779661, 0.04166667],
[0.11111111, 0.5 , 0.05084746, 0.04166667],
[0.08333333, 0.45833333, 0.08474576, 0.04166667],
[0.19444444, 0.66666667, 0.06779661, 0.04166667]])
# Undo the scaling: map normalized values back to the original units.
scaler. inverse_transform( result) [ : 5 ]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
# feature_range lets MinMaxScaler target any interval, here [5, 10].
scaler = MinMaxScaler(feature_range=[5, 10])
result = scaler.fit_transform(data1)
result[:5]
array([[6.11111111, 8.125 , 5.33898305, 5.20833333],
[5.83333333, 7.08333333, 5.33898305, 5.20833333],
[5.55555556, 7.5 , 5.25423729, 5.20833333],
[5.41666667, 7.29166667, 5.42372881, 5.20833333],
[5.97222222, 8.33333333, 5.33898305, 5.20833333]])
# partial_fit() updates the learned min/max incrementally — useful when
# the data is too large to process in a single batch.
scaler = scaler. partial_fit( data1)
scaler. transform( data1) [ : 5 ]
array([[6.11111111, 8.125 , 5.33898305, 5.20833333],
[5.83333333, 7.08333333, 5.33898305, 5.20833333],
[5.55555556, 7.5 , 5.25423729, 5.20833333],
[5.41666667, 7.29166667, 5.42372881, 5.20833333],
[5.97222222, 8.33333333, 5.33898305, 5.20833333]])
Standardization (z-score scaling)
# Standardization (z-score): center each feature to mean 0, variance 1.
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(data1)               # learn per-feature mean and variance
x_std = scaler.transform(data1)
scaler.mean_                    # the fitted per-feature means
array([5.84333333, 3.05733333, 3.758 , 1.19933333])
# Per-feature variances learned during fit().
scaler. var_
array([0.68112222, 0.18871289, 3.09550267, 0.57713289])
# After standardization the overall mean is ~0 and the std is exactly 1.
print ( 'mean: ' , x_std. mean( ) )
print ( 'std: ' , x_std. std( ) )
mean: -1.4684549872375404e-15
std: 1.0
# One-step alternative: fit and transform in a single call.
scaler. fit_transform( data1) [ : 5 ]
array([[-0.90068117, 1.01900435, -1.34022653, -1.3154443 ],
[-1.14301691, -0.13197948, -1.34022653, -1.3154443 ],
[-1.38535265, 0.32841405, -1.39706395, -1.3154443 ],
[-1.50652052, 0.09821729, -1.2833891 , -1.3154443 ],
[-1.02184904, 1.24920112, -1.34022653, -1.3154443 ]])
# Recover the original values from the standardized ones.
scaler. inverse_transform( x_std) [ : 5 ]
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2]])
Choosing between standardization and normalization
Other preprocessing utilities in sklearn
Feature encoding
Categorical data — encoding the target label
# LabelEncoder maps string class labels to integers 0..n_classes-1.
from sklearn.preprocessing import LabelEncoder

y = data2.iloc[:, -1]   # the label column ("Survived")
le = LabelEncoder()
le = le.fit(y)          # learn the label -> integer mapping
label = le.transform(y)
label[:3]
array([0, 2, 2])
# fit_transform() performs both steps at once — same result as above.
label = le. fit_transform( y)
label[ : 3 ]
array([0, 2, 2])
# The distinct label values, in encoding order (index == integer code).
le. classes_
array(['No', 'Unknown', 'Yes'], dtype=object)
# Map the integer codes back to the original string labels.
le. inverse_transform( label) [ : 3 ]
array(['No', 'Yes', 'Yes'], dtype=object)
# Overwrite the label column in data2 with its integer-encoded form.
data2. iloc[ : , - 1 ] = label
data2. head( 3 )
Age Sex Embarked Survived 0 22.0 male S 0 1 38.0 female C 2 2 26.0 female S 2
Categorical data — encoding features
# OrdinalEncoder: integer-encode categorical *feature* columns.
from sklearn.preprocessing import OrdinalEncoder

data_ = data2.copy()   # work on a copy so data2 keeps its raw values
data_.head(3)
Age Sex Embarked Survived 0 22.0 male S 0 1 38.0 female C 2 2 26.0 female S 2
# Fit on the middle columns (Sex, Embarked); Age and the label are excluded.
model = OrdinalEncoder( ) . fit( data_. iloc[ : , 1 : - 1 ] )
# The category list learned for each encoded column.
model. categories_
[array([0., 1.]), array([0., 1., 2.])]
# Replace the categorical columns with their integer codes in place.
data_. iloc[ : , 1 : - 1 ] = model. fit_transform( data_. iloc[ : , 1 : - 1 ] )
data_. head( 3 )
Age Sex Embarked Survived 0 22.0 1.0 2.0 0 1 38.0 0.0 0.0 2 2 26.0 0.0 2.0 2
One-hot encoding: create dummy variables
# data2 still holds the raw (string) feature values at this point.
data2. head( 3 )
Age Sex Embarked Survived 0 22.0 male S 0 1 38.0 female C 2 2 26.0 female S 2
# OneHotEncoder expands each categorical column into one 0/1 indicator
# column per category (dummy variables).
from sklearn.preprocessing import OneHotEncoder

X = data2.iloc[:, 1:-1]                  # the Sex and Embarked columns
model = OneHotEncoder(categories='auto').fit(X)
result = model.transform(X).toarray()    # densify the sparse output
result[:3]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# Equivalent single-step call: fit and transform together.
OneHotEncoder( categories= 'auto' ) . fit_transform( X) . toarray( ) [ : 3 ]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# BUG FIX: the fitted encoder is named `model`, not `enc` — the original
# line raised NameError.  inverse_transform maps one-hot rows back to the
# original (Sex, Embarked) category pairs.
pd.DataFrame(model.inverse_transform(result))
# Names of the generated dummy columns.
# NOTE: get_feature_names() was removed in scikit-learn 1.2; the
# replacement is get_feature_names_out().
model.get_feature_names_out()
result[:3]
array([[0., 1., 0., 0., 1.],
[1., 0., 1., 0., 0.],
[1., 0., 0., 0., 1.]])
# 712 rows (after dropna) x 5 dummy columns (2 for Sex + 3 for Embarked).
result. shape
(712, 5)
# BUG FIX: data2's index has gaps after index_col/dropna(), while
# pd.DataFrame(result) gets a fresh 0..n-1 RangeIndex.  concat(axis=1)
# aligns on the index, so the original call misaligned rows and produced
# NaN-padded output.  Reusing data2's index keeps the rows paired up.
newdata = pd.concat([data2, pd.DataFrame(result, index=data2.index)], axis=1)
newdata.head(3)
Age Sex Embarked Survived 0 1 2 3 4 0 22.0 male S No 0.0 1.0 0.0 0.0 1.0 1 38.0 female C Yes 1.0 0.0 1.0 0.0 0.0 2 26.0 female S Yes 1.0 0.0 0.0 0.0 1.0
# Drop the raw categorical columns, then give every column a clear name.
newdata.drop(["Sex", "Embarked"], axis=1, inplace=True)
newdata.columns = [
    "Age", "Survived",
    "Female", "Male",
    "Embarked_C", "Embarked_Q", "Embarked_S",
]
newdata.head(3)
Age Survived Female Male Embarked_C Embarked_Q Embarked_S 0 22.0 No 0.0 1.0 0.0 0.0 1.0 1 38.0 Yes 1.0 0.0 1.0 0.0 0.0 2 26.0 Yes 1.0 0.0 0.0 0.0 1.0
Continuous data — binarization
# Binarizer thresholds a continuous feature into 0/1.
from sklearn.preprocessing import Binarizer

data_2 = data2.copy()
# Age column reshaped to a 2-D column vector, as sklearn transformers expect.
X = data_2.iloc[:, 0].values.reshape(-1, 1)
transformer = Binarizer(threshold=30).fit_transform(X)  # 1 where age > 30
transformer[:3]
array([[0.],
[1.],
[0.]])
Continuous data — binning (discretization)
# KBinsDiscretizer bins a continuous feature into discrete intervals.
from sklearn.preprocessing import KBinsDiscretizer

X = data2.iloc[:, 0].values.reshape(-1, 1)

# 'ordinal' labels each bin 0..n_bins-1; 'uniform' makes equal-width bins.
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit_transform(X)
set(est.fit_transform(X).ravel())   # the distinct bin labels produced

# 'onehot' returns a sparse indicator matrix instead of integer labels.
est = KBinsDiscretizer(n_bins=3, encode='onehot', strategy='uniform')
est.fit_transform(X).toarray()[:3]
array([[1., 0., 0.],
[0., 1., 0.],
[1., 0., 0.]])