一、Getting data with datasets
datasets
▪ Small datasets ship inside the sklearn package; load them with datasets.load_*
▪ Large datasets are not bundled and must be fetched with datasets.fetch_*
from sklearn import datasets
boston = datasets.load_boston()
print(boston.DESCR)  # brief description of the dataset
housing = datasets.fetch_california_housing()  # fetch a larger dataset
print(housing.DESCR)
Datasets come back in scikit-learn's Bunch format; the features and labels are pulled out as:
X, y = boston.data, boston.target
Using scikit-learn to create toy data
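Besides loading and fetching, scikit-learn can synthesize data with known structure through the datasets.make_* functions. A minimal sketch (the parameter values here are arbitrary):
# 100 samples, 4 features, 2 of them informative, for classification
X_toy, y_toy = datasets.make_classification(n_samples=100, n_features=4,
                                            n_informative=2, random_state=0)
# A regression counterpart with controllable noise
X_reg, y_reg = datasets.make_regression(n_samples=100, n_features=3, noise=1.0)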
二、Data transformations: standardization, binarization, and more
1. Scaling data to the standard normal
# Scale data to the standard normal distribution
# The scale function
from sklearn import preprocessing
X_2 = preprocessing.scale(X[:, :3])
X_2.mean(axis=0)
# array([ 6.34099712e-17, -6.34319123e-16, -2.68291099e-15])
X_2.std(axis=0)
# array([ 1., 1., 1.])
# The StandardScaler class: same transform, but keeps its state for reuse on new data
my_scaler = preprocessing.StandardScaler()
my_scaler.fit(X[:, :3])
my_scaler.transform(X[:, :3]).mean(axis=0)
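The fitted scaler keeps the statistics it learned, so the same shift and scale can be reapplied to unseen data. A quick sketch (the scale_ name assumes a reasonably recent scikit-learn; very old releases exposed std_ instead):
my_scaler.mean_   # per-column means subtracted by transform
my_scaler.scale_  # per-column standard deviations divided out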
# MinMaxScaler: rescale each feature to [0, 1]
my_minmax_scaler = preprocessing.MinMaxScaler()
my_minmax_scaler.fit(X[:, :3])
my_minmax_scaler.transform(X[:, :3]).max(axis=0)
# MinMaxScaler with a custom range
my_odd_scaler = preprocessing.MinMaxScaler(feature_range=(-3.14, 3.14))
my_odd_scaler.fit(X[:, :3])
my_odd_scaler.transform(X[:, :3]).max(axis=0)
2. Normalization
# normalize rescales each sample (row) to unit norm, unlike scale, which standardizes each feature (column)
import numpy as np
normalized_X = preprocessing.normalize(X[:, :3])
# A StandardScaler with both options switched off is an identity transform, a handy baseline check
my_useless_scaler = preprocessing.StandardScaler(with_mean=False, with_std=False)
transformed_sd = my_useless_scaler.fit_transform(X[:, :3]).std(axis=0)
original_sd = X[:, :3].std(axis=0)
np.array_equal(transformed_sd, original_sd)  # True
3. Creating binary features through thresholding
new_target = preprocessing.binarize(boston.target,
                                    threshold=boston.target.mean())
new_target[:5]
# Check against a manual comparison
(boston.target[:5] > boston.target.mean()).astype(int)
# Using the Binarizer class
binarizer = preprocessing.Binarizer(threshold=boston.target.mean())
new_target = binarizer.fit_transform(boston.target)
new_target[:5]
三、Working with categorical variables
1. OneHotEncoder
iris = datasets.load_iris()
X = iris.data
y = iris.target
# Stack X and y into a single array
d = np.column_stack((X, y))
text_encoder = preprocessing.OneHotEncoder()
text_encoder.fit_transform(d[:, -1:]).toarray()[:5]
text_encoder.transform(np.ones((3, 1))).toarray()
#array([[ 0., 1., 0.],
# [ 0., 1., 0.],
# [ 0., 1., 0.]])
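To see which output column corresponds to which class, the fitted encoder exposes the category values it learned (the categories_ attribute assumes scikit-learn >= 0.20; the pre-0.20 encoder exposed active_features_ instead):
text_encoder.categories_
# [array([ 0.,  1.,  2.])]  # one-hot columns follow this order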
2. DictVectorizer
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()
my_dict = [{'species': iris.target_names[i]} for i in y]
dv.fit_transform(my_dict).toarray()[:5]
# array([[ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.],
# [ 1., 0., 0.]])
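DictVectorizer records the column names it generated, which makes the one-hot output readable (feature_names_ is a documented attribute of the fitted vectorizer):
dv.feature_names_
# ['species=setosa', 'species=versicolor', 'species=virginica']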
3. Binarizing Label Features
target = iris.target
from sklearn.preprocessing import LabelBinarizer
label_binarizer = LabelBinarizer()
new_target = label_binarizer.fit_transform(target)
new_target.shape
new_target[:5]
# array([[1, 0, 0],
# [1, 0, 0],
# [1, 0, 0],
# [1, 0, 0],
# [1, 0, 0]])
label_binarizer.classes_
#array([0, 1, 2])
label_binarizer = LabelBinarizer(neg_label=-1000, pos_label=1000)
label_binarizer.fit_transform(target)[:5]
#array([[ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000],
# [ 1000, -1000, -1000]])
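A round trip back to the original labels works through inverse_transform, even with the custom neg_label/pos_label values:
label_binarizer.inverse_transform(label_binarizer.fit_transform(target))[:5]
# array([0, 0, 0, 0, 0])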
四、Handling missing values
from sklearn import datasets
import numpy as np
iris=datasets.load_iris()
iris_X = iris.data
masking_array = np.random.binomial(1, .25, iris_X.shape).astype(bool)
iris_X[masking_array] = np.nan
from sklearn import preprocessing
impute = preprocessing.Imputer()  # pre-0.20 API; newer releases use sklearn.impute.SimpleImputer
iris_X_prime = impute.fit_transform(iris_X)
iris_X_prime[:5]
iris_X_prime[3, 0]  # the imputed value (column mean)
iris_X[3, 0]        # the original value (nan if it was masked)
impute2 = preprocessing.Imputer(strategy='median')
iris_X_prime2 = impute2.fit_transform(iris_X)
iris_X_prime2[:5]
# Missing values can be flagged by a sentinel instead of nan
iris_X[np.isnan(iris_X)] = -1
iris_X[:5]
impute3 = preprocessing.Imputer(missing_values=-1)
iris_X_prime = impute3.fit_transform(iris_X)
iris_X_prime[:5]
import pandas as pd
iris_X[masking_array] = np.nan
iris_df = pd.DataFrame(iris_X, columns=iris.feature_names)
iris_df.fillna(iris_df.mean())['sepal length (cm)'].head(5)
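The same one-liner adapts to other strategies; a median fill mirrors Imputer(strategy='median') above:
iris_df.fillna(iris_df.median())['sepal length (cm)'].head(5)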
五、Using a pipeline for multiple preprocessing steps
1. Pipeline
from sklearn import datasets
import numpy as np
mat = datasets.make_spd_matrix(10)
masking_array = np.random.binomial(1, .1, mat.shape).astype(bool)
mat[masking_array] = np.nan
mat[:4, :4]
from sklearn import pipeline, preprocessing
impute = preprocessing.Imputer()
scaler = preprocessing.StandardScaler()
pipe = pipeline.Pipeline([('impute', impute), ('scaler', scaler)])
new_mat = pipe.fit_transform(mat)
new_mat[:4, :4]
2. Pipeline methods (see the sketch below)
• fit
• transform
• fit_transform
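A sketch of how the three methods fit together, reusing the pipe object from above; named_steps gives access to an individual fitted step:
pipe.fit(mat)                  # fit both steps in order
new_mat = pipe.transform(mat)  # equivalent to the fit_transform call above
pipe.named_steps['impute'].statistics_  # per-column fill values the imputer learned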
六、Reducing dimensionality with PCA
1. PCA
from sklearn import datasets, decomposition
iris = datasets.load_iris()  # reload a clean copy, since iris_X was modified above
iris_X = iris.data
# Reduce to 2 dimensions
pca2 = decomposition.PCA(n_components=2)
iris_X_prime = pca2.fit_transform(iris_X)
iris_X_prime.shape
# (150, 2)
pca2.explained_variance_ratio_.sum()
# 0.97763177502480336
# n_components can also be a fraction of the variance to keep, e.g. 98%
pca3 = decomposition.PCA(n_components=.98)
iris_X_prime2 = pca3.fit_transform(iris_X)  # fit alone returns the estimator, not the data
pca3.explained_variance_ratio_.sum()
# 0.99481691454981014
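Since PCA is a linear projection, the reduced data can be mapped back with inverse_transform; comparing against the original gives a feel for what the dropped components carried (a small sketch; the exact number depends on the data):
iris_X_rec = pca2.inverse_transform(iris_X_prime)
np.abs(iris_X_rec - iris_X).mean()  # mean absolute reconstruction error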
2. Using Factor Analysis for decomposition
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components=2)
iris_two_dim = fa.fit_transform(iris.data)
iris_two_dim[:5]
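One design difference from PCA: FactorAnalysis fits an independent noise term for each feature rather than assuming isotropic noise, and the estimate is available on the fitted object:
fa.noise_variance_  # one estimated noise variance per original feature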