这里的主要问题是:什么样的数据需要做什么样的预处理?
答:比如数据归一化,在实际应用中,通过梯度下降求解的模型通常需要归一化,包括线性回归、逻辑回归、支持向量机、神经网络等。但决策树不必。
训练机器学习模型的四个步骤
1.数据预处理与特征工程 2.划分训练集和测试集 3.建立模型并用训练数据拟合模型 4.评估模型
import numpy as np
from sklearn import preprocessing

# Toy feature matrix: 3 samples x 4 features.
data = np.array([[3, -1.5, 2, -5.4],
                 [0, 4, -0.3, 2.1],
                 [1, 3.3, -1.9, -4.3]])

# Standardization: remove each column's mean and scale to unit variance.
data_standardized = preprocessing.scale(data)
print("\n Mean =", data_standardized.mean(axis=0))
print("Std deviation =", data_standardized.std(axis=0))
Mean = [ 5.55111512e-17 -1.11022302e-16 -7.40148683e-17 -7.40148683e-17]
Std deviation = [1. 1. 1. 1.]
# Min-max scaling: map every column linearly onto the range [0, 1].
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
print("\nMin max scaled data =", data_scaled)
Min max scaled data = [[1. 0. 1. 0. ]
[0. 1. 0.41025641 1. ]
[0.33333333 0.87272727 0. 0.14666667]]
# L1 normalization: scale each row so its absolute values sum to 1.
data_normalized = preprocessing.normalize(data, norm='l1')
print("\n L1 normalized data =", data_normalized)
L1 normalized data = [[ 0.25210084 -0.12605042 0.16806723 -0.45378151]
[ 0. 0.625 -0.046875 0.328125 ]
[ 0.0952381 0.31428571 -0.18095238 -0.40952381]]
# Binarization: entries above the threshold become 1, the rest become 0.
data_binarized = preprocessing.binarize(X=data, threshold=1.4)
print("\nBinarized data: \n", data_binarized)
Binarized data:
[[1. 0. 1. 0.]
[0. 1. 0. 1.]
[0. 1. 0. 0.]]
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0 , 2 , 1 , 12 ], [1 , 3 , 5 , 3 ], [2 , 3 , 2 , 12 ],
[1 , 2 , 4 , 3 ]])
encoded_vector = encoder.transform([[2 ,3 ,5 ,3 ]]).toarray()
print("\nEncoded vector: \n" ,encoded_vector)
Encoded vector:
[[0. 0. 1. 0. 1. 0. 0. 0. 1. 1. 0.]]
label_encoder = preprocessing.LabelEncoder()
input_classes = ['audi' , 'ford' , 'audi' , 'toyota' , 'ford' , 'bmw' ]
label_encoder.fit(input_classes)
print("\n Class mapping:" )
for i ,item in enumerate(label_encoder.classes_):
print(item,'-->' ,i)
encoded_labels = [2 , 1 , 0 , 3 , 1 ]
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print ("\nEncoded labels =" , encoded_labels)
print ("Decoded labels =" , list(decoded_labels))
Class mapping:
audi --> 0
bmw --> 1
ford --> 2
toyota --> 3
Encoded labels = [2, 1, 0, 3, 1]
Decoded labels = ['ford', 'bmw', 'audi', 'toyota', 'bmw']
C:\Users\dell\Anaconda3\lib\site-packages\sklearn\preprocessing\label.py:151: DeprecationWarning: The truth value of an empty array is ambiguous. Returning False, but in future this will result in an error. Use `array.size > 0` to check that an array is not empty.
if diff:
import sys
import os

# NOTE(review): taking the 6th directory entry is fragile -- os.listdir()
# order is arbitrary across platforms; confirm the intended data file and
# reference it by name instead.
filename = os.listdir()[5]

# Load a single-feature regression dataset: each line is "x,y".
X = []
y = []
with open(filename, 'r') as f:
    for line in f:  # iterate lazily instead of readlines() (no full read)
        if not line.strip():
            continue  # a trailing blank line would otherwise crash float()
        xt, yt = [float(i) for i in line.split(',')]
        X.append(xt)
        y.append(yt)

# 80/20 train/test split, preserving file order (no shuffling).
num_training = int(0.8 * len(X))
num_test = len(X) - num_training
X_train = np.array(X[:num_training]).reshape((num_training, 1))
y_train = np.array(y[:num_training])
X_test = np.array(X[num_training:]).reshape((num_test, 1))
y_test = np.array(y[num_training:])
from sklearn import linear_model

# Ordinary least-squares fit on the training split.
linear_regressor = linear_model.LinearRegression()
linear_regressor.fit(X_train, y_train)
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)
import matplotlib.pyplot as plt

# Visualize the fit on the training data: scatter of points plus the
# fitted regression line.
y_train_pred = linear_regressor.predict(X_train)
plt.figure()
plt.scatter(X_train, y_train, color='green')
plt.plot(X_train, y_train_pred, color='black', linewidth=4)
plt.title('Training data')
plt.show()
# Same visualization on the held-out test split (axis ticks hidden).
y_test_pred = linear_regressor.predict(X_test)
import matplotlib.pyplot as plt
plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()
import sklearn.metrics as sm

# Standard regression metrics on the test split, rounded to 2 decimals.
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explain variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))
Mean absolute error = 0.54
Mean squared error = 0.38
Median absolute error = 0.54
Explain variance score = 0.68
R2 score = 0.68
import pickle

# Persist the trained model, reload it, and confirm it still predicts
# identically (pickle round-trip sanity check).
output_model_file = "saved_model.pkl"
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)

with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)

y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =",
      round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))
New mean absolute error = 0.54
岭回归:普通线性回归对数据中的异常值较为敏感。岭回归在损失函数中加入 L2 正则化项(由系数 alpha 控制正则化强度),从而约束回归系数的大小,降低模型对异常值的敏感程度。
from sklearn import linear_model

# Ridge regression: L2-regularized least squares; alpha controls the
# regularization strength.
ridge_regressor = linear_model.Ridge(alpha=0.01, fit_intercept=True,
                                     max_iter=10000)
ridge_regressor.fit(X_train, y_train)
y_test_pred_ridge = ridge_regressor.predict(X_test)
print("Mean absolute error =",
      round(sm.mean_absolute_error(y_test, y_test_pred_ridge), 2))
Mean absolute error = 0.54
from sklearn.preprocessing import PolynomialFeatures

# Expand the single input feature into polynomial terms x^0 .. x^10,
# turning one column into eleven.
polynomial = PolynomialFeatures(degree=10)
X_train_transformed = polynomial.fit_transform(X_train)
X_train_transformed.shape
(40, 11)
# Three single-feature query points for comparing the linear model
# against the polynomial model.
datapoint = [[0.39], [2.78], [7.11]]
# Use transform(), not fit_transform(): the expansion was already fitted
# on X_train above; refitting on the query points would silently
# recompute the transformer's state from the wrong data.
poly_datapoint = polynomial.transform(datapoint)
poly_datapoint
array([[1.00000000e+00, 3.90000000e-01, 1.52100000e-01, 5.93190000e-02,
2.31344100e-02, 9.02241990e-03, 3.51874376e-03, 1.37231007e-03,
5.35200926e-04, 2.08728361e-04, 8.14040609e-05],
[1.00000000e+00, 2.78000000e+00, 7.72840000e+00, 2.14849520e+01,
5.97281666e+01, 1.66044303e+02, 4.61603162e+02, 1.28325679e+03,
3.56745388e+03, 9.91752179e+03, 2.75707106e+04],
[1.00000000e+00, 7.11000000e+00, 5.05521000e+01, 3.59425431e+02,
2.55551481e+03, 1.81697103e+04, 1.29186640e+05, 9.18517014e+05,
6.53065597e+06, 4.64329639e+07, 3.30138373e+08]])
# Fit a linear model on the polynomial features, then compare its
# predictions with the plain linear regressor on the same query points.
poly_linear_model = linear_model.LinearRegression()
poly_linear_model.fit(X_train_transformed, y_train)
print("\nLinear regression: \n", linear_regressor.predict(datapoint))
print("\nPolynomial regression:\n", poly_linear_model.predict(poly_datapoint))
Linear regression:
[3.06649462 4.0038045 5.70194331]
Polynomial regression:
[ 3.14830911 3.7267279 83.2972363 ]
决策树回归器与 AdaBoost 算法:AdaBoost 指自适应增强(adaptive boosting)算法,每个阶段获取的信息都会反馈到模型中,使后续阶段的学习器重点训练前面阶段难以拟合(误差较大)的样本。
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
# Boston housing dataset (506 samples, 13 features, median-price target).
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed
# in 1.2 -- confirm the installed version still provides it.
housing_data = datasets.load_boston()
# Shuffle with a fixed seed so the 80/20 split below is reproducible.
X,y = shuffle(housing_data.data, housing_data.target, random_state=7 )
num_training = int(0.8 * len(X))
X_train,y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:],y[num_training:]
# Baseline: a single depth-4 regression tree.
dt_regressor = DecisionTreeRegressor(max_depth=4 )
dt_regressor.fit(X_train,y_train)
# AdaBoost ensemble of 400 depth-4 trees (same base learner as the baseline).
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4 ),
n_estimators=400 ,random_state=7 )
ab_regressor.fit(X_train,y_train)
AdaBoostRegressor(base_estimator=DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
max_leaf_nodes=None, min_impurity_decrease=0.0,
min_impurity_split=None, min_samples_leaf=1,
min_samples_split=2, min_weight_fraction_leaf=0.0,
presort=False, random_state=None, splitter='best'),
learning_rate=1.0, loss='linear', n_estimators=400,
random_state=7)
# Evaluate the plain decision tree on the test split.
y_pred_dt = dt_regressor.predict(X_test)
dt_mse = mean_squared_error(y_test, y_pred_dt)
dt_evs = explained_variance_score(y_test, y_pred_dt)
print("Mean squared error =", round(dt_mse, 2))
print("Explained variance score =", round(dt_evs, 2))
Mean squared error = 14.79
Explained variance score = 0.82
# Evaluate the AdaBoost ensemble on the same test split.
y_pred_ab = ab_regressor.predict(X_test)
ab_mse = mean_squared_error(y_test, y_pred_ab)
ab_evs = explained_variance_score(y_test, y_pred_ab)
print("\n ### Adaboost performance #### ")
print("Mean squared error= ", round(ab_mse, 2))
print("Explained variance score = ", round(ab_evs, 2))
### Adaboost performance ####
Mean squared error= 7.64
Explained variance score = 0.91
计算数据特征的相对重要性
def plot_feature_importances(feature_importances, title, feature_names):
    """Bar-chart the feature importances, normalized so the max is 100."""
    # Scale importances relative to the largest one.
    feature_importances = 100.0 * (feature_importances / max(feature_importances))
    # Most-important feature first.
    order = np.flipud(np.argsort(feature_importances))
    print("index_sorted", order)
    bar_pos = np.arange(order.shape[0]) + 0.5
    print("pos", bar_pos)
    plt.figure()
    plt.bar(bar_pos, feature_importances[order], align='center')
    plt.xticks(bar_pos, feature_names[order])
    plt.ylabel("Relative Importance")
    plt.title(title)
    plt.show()
# Render figures at 8x4 inches and 300 dpi.
plt.rcParams['figure.figsize'] = (8.0, 4.0)
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['figure.dpi'] = 300

# Chart which housing features the plain decision tree relied on.
plot_feature_importances(dt_regressor.feature_importances_,
                         'Decision Tree Regressor',
                         housing_data.feature_names)
index_sorted [ 5 12 7 0 4 10 6 11 9 8 3 2 1]
pos [ 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5 11.5 12.5]
# Chart the AdaBoost model's feature importances for comparison.
plot_feature_importances(ab_regressor.feature_importances_,
                         "AdaBoost regressor",
                         housing_data.feature_names)
评估共享单车的需求分布
import csv
from sklearn.ensemble import RandomForestRegressor

# NOTE(review): index 1 into os.listdir() is fragile -- directory order is
# arbitrary; confirm this actually resolves to the bike-sharing CSV.
filename = os.listdir()[1]
# Bug fix: in Python 3, csv.reader requires a text-mode file; opening with
# 'rb' raises "iterator should return strings, not bytes". Open with 'r'
# and newline='' as the csv module documentation prescribes.
file_reader = csv.reader(open(filename, 'r', newline=''), delimiter=',')
file_reader
def load_dataset(filename):
    """Load the bike-sharing CSV into numpy arrays.

    Columns 2..12 of every row form the feature matrix and the last
    column is the target. The first row is a header: its columns 2..12
    supply the feature names.

    Returns:
        (X, y, feature_names) where X is float32 of shape (n, 11),
        y is float32 of shape (n,), feature_names is an array of str.
    """
    X, y = [], []
    # 'with' guarantees the file handle is closed even on error (the
    # original leaked it); newline='' is the documented way to open
    # files for the csv module.
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            X.append(row[2:13])
            y.append(row[-1])
    # Header row carries the feature names.
    feature_names = np.array(X[0])
    # Drop the header and convert the remaining values to numbers.
    return (np.array(X[1:]).astype(np.float32),
            np.array(y[1:]).astype(np.float32),
            feature_names)
# Load the bike-sharing data and shuffle reproducibly before a 90/10 split.
X, y, feature_names = load_dataset(filename)
X, y = shuffle(X, y, random_state=7 )
num_training = int(0.9 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]
# NOTE(review): 10000 trees is very slow to fit; the depth cap (10) limits
# each tree, but far fewer estimators would likely score similarly.
rf_regressor = RandomForestRegressor(n_estimators=10000 ,
max_depth=10 ,min_samples_split=2 )
rf_regressor.fit(X_train,y_train)
RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=10,
max_features='auto', max_leaf_nodes=None,
min_impurity_decrease=0.0, min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0, n_estimators=10000, n_jobs=1,
oob_score=False, random_state=None, verbose=0, warm_start=False)
# Evaluate the random forest on the held-out 10%.
y_pred = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, y_pred)
rf_evs = explained_variance_score(y_test, y_pred)
print("\n#### Random Forest regressor performance ####")
print("Mean squared error = ", round(rf_mse, 2))
print("Explained variance score =", round(rf_evs, 2))
#### Random Forest regressor performance ####
Mean squared error = 356360.61
Explained variance score = 0.89
# Chart which of the 11 bike-sharing features drive the forest's predictions.
plot_feature_importances(rf_regressor.feature_importances_,"Random Forest Regressor" ,feature_names)
index_sorted [ 7 1 8 0 9 10 2 6 4 5 3]
pos [ 0.5 1.5 2.5 3.5 4.5 5.5 6.5 7.5 8.5 9.5 10.5]