import numpy as np
from sklearn import preprocessing
# Sample feature matrix: 3 samples x 4 features.
data = np.array([[3, -1.5, 2, -5.4],
                 [0, 4, -0.3, 2.1],
                 [1, 3.3, -1.9, -4.3]])

# Mean removal: standardize each feature (column) to zero mean, unit variance.
data_standardized = preprocessing.scale(data)
print("\nMean =", data_standardized.mean(axis=0))
print("Std deviation =", data_standardized.std(axis=0))

# Min-max scaling: map each feature into the [0, 1] range.
data_scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
data_scaled = data_scaler.fit_transform(data)
print("\nMin max scaled data:\n", data_scaled)

# Normalization: rescale each sample (row) so that its L1 norm is 1.
data_normalized = preprocessing.normalize(data, norm='l1')
print("\nL1 normalized data:\n", data_normalized)

# Binarization: values above the threshold become 1, the rest become 0.
data_binarized = preprocessing.Binarizer(threshold=1.4).transform(data)
print("\nBinarized data:\n", data_binarized)

# One-hot encoding of integer-coded categorical features.
encoder = preprocessing.OneHotEncoder()
encoder.fit([[0, 2, 1, 12], [1, 3, 5, 3], [2, 3, 2, 12], [1, 2, 4, 3]])
encoded_vector = encoder.transform([[2, 3, 5, 3]]).toarray()
print("\nEncoded vector:\n", encoded_vector)
import numpy as np
from sklearn import preprocessing
# Map string class labels to integer codes and back with LabelEncoder.
label_encoder = preprocessing.LabelEncoder()
input_classes = ['audi', 'ford', 'audi', 'toyota', 'ford', 'bmw']
label_encoder.fit(input_classes)

# Print the learned class -> integer mapping (classes_ is sorted alphabetically).
print("\nClass mapping:")
for i, item in enumerate(label_encoder.classes_):
    print(item, '-->', i)

# Transform a set of string labels to their integer codes.
labels = ['toyota', 'ford', 'audi']
encoded_labels = label_encoder.transform(labels)
print("\nLabels =", labels)
print("Encoded labels =", list(encoded_labels))

# Inverse transform: integer codes back to string labels.
encoded_labels = [2, 1, 0, 3, 1]
decoded_labels = label_encoder.inverse_transform(encoded_labels)
print("\nEncoded labels =", encoded_labels)
print("Decoded labels =", list(decoded_labels))
import sys
import numpy as np
# Read one (x, y) pair per comma-separated line of the input file
# given on the command line.
filename = sys.argv[1]
X = []
y = []
with open(filename, 'r') as f:
    for line in f:
        xt, yt = [float(i) for i in line.split(',')]
        X.append(xt)
        y.append(yt)

# Train/test split: first 80% of the rows for training, the rest for testing.
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data, reshaped to a column vector (sklearn expects 2-D X).
X_train = np.array(X[:num_training]).reshape((num_training, 1))
y_train = np.array(y[:num_training])

# Test data
X_test = np.array(X[num_training:]).reshape((num_test, 1))
y_test = np.array(y[num_training:])

# Create and train the linear regression model.
from sklearn import linear_model
linear_regressor = linear_model.LinearRegression()
linear_regressor.fit(X_train, y_train)

# Predict on the held-out test set.
y_test_pred = linear_regressor.predict(X_test)

# Plot the test points and the fitted line.
import matplotlib.pyplot as plt
plt.scatter(X_test, y_test, color='green')
plt.plot(X_test, y_test_pred, color='black', linewidth=4)
plt.xticks(())
plt.yticks(())
plt.show()

# Measure performance with the standard regression metrics.
import sklearn.metrics as sm
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))

# Model persistence. Pickle streams are binary, so the files must be
# opened in 'wb'/'rb' mode; Python 3's pickle replaces the old cPickle.
import pickle
output_model_file = '3_model_linear_regr.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)
with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)
y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))
import sys
import numpy as np
# Read samples from the command-line file: all but the last comma-separated
# value on each line are features, the last value is the target.
filename = sys.argv[1]
X = []
y = []
with open(filename, 'r') as f:
    for line in f:
        data = [float(i) for i in line.split(',')]
        xt, yt = data[:-1], data[-1]
        X.append(xt)
        y.append(yt)

# Train/test split: first 80% for training, remaining 20% for testing.
num_training = int(0.8 * len(X))
num_test = len(X) - num_training

# Training data (already 2-D: one row per sample).
X_train = np.array(X[:num_training])
y_train = np.array(y[:num_training])

# Test data
X_test = np.array(X[num_training:])
y_test = np.array(y[num_training:])

# Plain least-squares and L2-regularized (ridge) linear models.
from sklearn import linear_model
linear_regressor = linear_model.LinearRegression()
ridge_regressor = linear_model.Ridge(alpha=0.01, fit_intercept=True, max_iter=10000)

# Train both models on the training set.
linear_regressor.fit(X_train, y_train)
ridge_regressor.fit(X_train, y_train)

# Predict the output of both models on the test set.
y_test_pred = linear_regressor.predict(X_test)
y_test_pred_ridge = ridge_regressor.predict(X_test)

# Measure and compare performance.
import sklearn.metrics as sm
print("LINEAR:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred), 2))
print("\nRIDGE:")
print("Mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_ridge), 2))
print("Mean squared error =", round(sm.mean_squared_error(y_test, y_test_pred_ridge), 2))
print("Median absolute error =", round(sm.median_absolute_error(y_test, y_test_pred_ridge), 2))
print("Explained variance score =", round(sm.explained_variance_score(y_test, y_test_pred_ridge), 2))
print("R2 score =", round(sm.r2_score(y_test, y_test_pred_ridge), 2))

# Polynomial regression: expand the features to degree-10 polynomial terms
# and fit a linear model in the expanded space.
from sklearn.preprocessing import PolynomialFeatures
polynomial = PolynomialFeatures(degree=10)
X_train_transformed = polynomial.fit_transform(X_train)

# A single query point. sklearn transform/predict expect a 2-D array
# (n_samples, n_features), so the sample is wrapped in an outer list.
datapoint = [[0.39, 2.78, 7.11]]
poly_datapoint = polynomial.fit_transform(datapoint)

poly_linear_model = linear_model.LinearRegression()
poly_linear_model.fit(X_train_transformed, y_train)
print("\nLinear regression:\n", linear_regressor.predict(datapoint))
print("\nPolynomial regression:\n", poly_linear_model.predict(poly_datapoint))

# Stochastic Gradient Descent regressor.
# max_iter replaced the old n_iter parameter in modern scikit-learn.
sgd_regressor = linear_model.SGDRegressor(loss='huber', max_iter=50)
sgd_regressor.fit(X_train, y_train)
print("\nSGD regressor:\n", sgd_regressor.predict(datapoint))
import numpy as np
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn import datasets
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
def plot_feature_importances(feature_importances, title, feature_names):
    """Bar-plot feature importances (as a % of the largest), sorted descending.

    Parameters
    ----------
    feature_importances : array-like of float
        Raw importance values, one per feature.
    title : str
        Title for the figure.
    feature_names : array-like of str
        One label per importance value.
    """
    # Normalize the importance values relative to the largest one.
    feature_importances = 100.0 * (feature_importances / max(feature_importances))

    # Sort indices by importance and flip to descending order.
    index_sorted = np.flipud(np.argsort(feature_importances))

    # Center each bar on its tick position.
    pos = np.arange(index_sorted.shape[0]) + 0.5

    # Plot the bar graph.
    plt.figure()
    plt.bar(pos, feature_importances[index_sorted], align='center')
    # np.asarray allows feature_names to be a plain list as well as an
    # ndarray (fancy indexing with index_sorted requires an array).
    plt.xticks(pos, np.asarray(feature_names)[index_sorted])
    plt.ylabel('Relative Importance')
    plt.title(title)
    plt.show()
# NOTE(review): this __main__ section lost its indentation in the paste and is
# cut off here; its continuation (train/test split, Decision Tree, AdaBoost)
# appears further down in the file. Restore indentation/ordering before running.
if __name__=='__main__':
# Load housing data
# NOTE(review): datasets.load_boston() was deprecated and removed in
# scikit-learn 1.2 — confirm the installed sklearn version supports it.
housing_data = datasets.load_boston()
# Shuffle the data (fixed seed for reproducibility) before any split.
X, y = shuffle(housing_data.data, housing_data.target, random_state=7)
import sys
import csv
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error, explained_variance_score
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from housing import plot_feature_importances
def load_dataset(filename):
    """Load a CSV dataset: columns 2..12 are kept as features, the last
    column of each row is the target.

    The first row of the file is a header; its sliced columns become the
    feature names and the row is stripped from the returned data.

    Parameters
    ----------
    filename : str
        Path to the comma-delimited input file.

    Returns
    -------
    tuple of (X, y, feature_names)
        X : float32 ndarray of shape (n_samples, n_kept_columns)
        y : float32 ndarray of shape (n_samples,)
        feature_names : ndarray of str from the header row
    """
    X, y = [], []
    # Python 3: csv.reader requires a text-mode file ('rb' was the Python 2
    # idiom); the with-block also guarantees the handle is closed.
    with open(filename, 'r', newline='') as f:
        for row in csv.reader(f, delimiter=','):
            X.append(row[2:13])
            y.append(row[-1])

    # Extract feature names from the (sliced) header row.
    feature_names = np.array(X[0])

    # Remove the first row because it holds the feature names, then
    # convert the remaining string cells to numeric arrays.
    return np.array(X[1:]).astype(np.float32), np.array(y[1:]).astype(np.float32), feature_names
if __name__ == '__main__':
    # Load the dataset from the input file given on the command line,
    # then shuffle with a fixed seed for reproducibility.
    X, y, feature_names = load_dataset(sys.argv[1])
    X, y = shuffle(X, y, random_state=7)

    # Split the data 90/10 (90% for training, 10% for testing).
    num_training = int(0.9 * len(X))
    X_train, y_train = X[:num_training], y[:num_training]
    X_test, y_test = X[num_training:], y[num_training:]

    # Fit Random Forest regression model.
    # min_samples_split must be an int >= 2 (a value of 1 is rejected by
    # modern scikit-learn).
    rf_regressor = RandomForestRegressor(n_estimators=1000, max_depth=10, min_samples_split=2)
    rf_regressor.fit(X_train, y_train)

    # Evaluate performance of the Random Forest regressor on the test set.
    y_pred = rf_regressor.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    evs = explained_variance_score(y_test, y_pred)
    print("\n#### Random Forest regressor performance ####")
    print("Mean squared error =", round(mse, 2))
    print("Explained variance score =", round(evs, 2))

    # Plot relative feature importances.
    plot_feature_importances(rf_regressor.feature_importances_, 'Random Forest regressor', feature_names)
# NOTE(review): this section continues the Boston-housing script above —
# it relies on X, y, and housing_data defined under that script's __main__
# guard; the paste lost the original indentation and ordering.

# Split the data 80/20 (80% for training, 20% for testing).
num_training = int(0.8 * len(X))
X_train, y_train = X[:num_training], y[:num_training]
X_test, y_test = X[num_training:], y[num_training:]

# Fit a single decision-tree regression model.
dt_regressor = DecisionTreeRegressor(max_depth=4)
dt_regressor.fit(X_train, y_train)

# Fit an AdaBoost ensemble built on depth-4 decision trees.
ab_regressor = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), n_estimators=400, random_state=7)
ab_regressor.fit(X_train, y_train)

# Evaluate performance of the Decision Tree regressor.
y_pred_dt = dt_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_dt)
evs = explained_variance_score(y_test, y_pred_dt)
print("\n#### Decision Tree performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Evaluate performance of AdaBoost.
y_pred_ab = ab_regressor.predict(X_test)
mse = mean_squared_error(y_test, y_pred_ab)
evs = explained_variance_score(y_test, y_pred_ab)
print("\n#### AdaBoost performance ####")
print("Mean squared error =", round(mse, 2))
print("Explained variance score =", round(evs, 2))

# Plot relative feature importances of both models.
plot_feature_importances(dt_regressor.feature_importances_,
                         'Decision Tree regressor', housing_data.feature_names)
plot_feature_importances(ab_regressor.feature_importances_,
                         'AdaBoost regressor', housing_data.feature_names)
# Model persistence for the trained linear regressor.
# Python 3's pickle replaces the old cPickle module, pickle files must be
# opened in binary mode, and the final print statement had its identifier
# split across two lines (a syntax error) — all fixed here.
import pickle
output_model_file = 'saved_model.pkl'
with open(output_model_file, 'wb') as f:
    pickle.dump(linear_regressor, f)
with open(output_model_file, 'rb') as f:
    model_linregr = pickle.load(f)
y_test_pred_new = model_linregr.predict(X_test)
print("\nNew mean absolute error =", round(sm.mean_absolute_error(y_test, y_test_pred_new), 2))
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt
def plot_classifier(classifier, X, y):
    """Visualize a 2-D classifier's decision regions with the data overlaid.

    Parameters
    ----------
    classifier : fitted estimator with a predict() method
    X : ndarray of shape (n_samples, 2) — the two plotted features
    y : array of class labels, used to color the scatter points
    """
    # Plot ranges, padded by one unit on every side.
    x_min, x_max = min(X[:, 0]) - 1.0, max(X[:, 0]) + 1.0
    y_min, y_max = min(X[:, 1]) - 1.0, max(X[:, 1]) + 1.0

    # Step size used when building the evaluation mesh.
    step_size = 0.01

    # Build the grid of points covering the plot area.
    x_values, y_values = np.meshgrid(np.arange(x_min, x_max, step_size),
                                     np.arange(y_min, y_max, step_size))

    # Classify every grid point, then restore the grid shape for plotting.
    flat_grid = np.c_[x_values.ravel(), y_values.ravel()]
    mesh_output = classifier.predict(flat_grid).reshape(x_values.shape)

    # Colored map of the decision regions. Colormap options:
    # http://matplotlib.org/examples/color/colormaps_reference.html
    plt.figure()
    plt.pcolormesh(x_values, y_values, mesh_output, cmap=plt.cm.gray)

    # Overlay the training points on the plot.
    plt.scatter(X[:, 0], X[:, 1], c=y, s=80,
                edgecolors='black', linewidth=1, cmap=plt.cm.Paired)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from logistic_regression import plot_classifier
input_file = 'data_multivar.txt'

# Read samples: the last comma-separated value on each line is the class
# label, the preceding values are the features.
X = []
y = []
with open(input_file, 'r') as f:
    for line in f:
        data = [float(x) for x in line.split(',')]
        X.append(data[:-1])
        y.append(data[-1])
X = np.array(X)
y = np.array(y)

# Train a Gaussian Naive Bayes classifier on the full dataset.
classifier_gaussiannb = GaussianNB()
classifier_gaussiannb.fit(X, y)
y_pred = classifier_gaussiannb.predict(X)

# Training-set accuracy (optimistic — see the held-out split below).
accuracy = 100.0 * (y == y_pred).sum() / X.shape[0]
print("Accuracy of the classifier =", round(accuracy, 2), "%")
plot_classifier(classifier_gaussiannb, X, y)

###############################################
# Train/test split.
# sklearn.cross_validation was removed; its functions now live in
# sklearn.model_selection.
from sklearn import model_selection
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.25, random_state=5)
classifier_gaussiannb_new = GaussianNB()
classifier_gaussiannb_new.fit(X_train, y_train)
y_test_pred = classifier_gaussiannb_new.predict(X_test)

# Accuracy on the held-out test set.
accuracy = 100.0 * (y_test == y_test_pred).sum() / X_test.shape[0]
print("Accuracy of the classifier =", round(accuracy, 2), "%")
plot_classifier(classifier_gaussiannb_new, X_test, y_test)

###############################################
# Cross-validation with several scoring functions.
num_validations = 5
accuracy = model_selection.cross_val_score(classifier_gaussiannb,
                                           X, y, scoring='accuracy', cv=num_validations)
print("Accuracy: " + str(round(100 * accuracy.mean(), 2)) + "%")
f1 = model_selection.cross_val_score(classifier_gaussiannb,
                                     X, y, scoring='f1_weighted', cv=num_validations)
print("F1: " + str(round(100 * f1.mean(), 2)) + "%")
precision = model_selection.cross_val_score(classifier_gaussiannb,
                                            X, y, scoring='precision_weighted', cv=num_validations)
print("Precision: " + str(round(100 * precision.mean(), 2)) + "%")
recall = model_selection.cross_val_score(classifier_gaussiannb,
                                         X, y, scoring='recall_weighted', cv=num_validations)
print("Recall: " + str(round(100 * recall.mean(), 2)) + "%")
# NOTE(review): these statements are the tail of plot_classifier() — axis
# limits and ticks computed from the mesh grid — that the paste dropped out
# of the function body. At this scope x_values/y_values/X are undefined;
# they should be moved back inside plot_classifier before running.
# specify the boundaries of the figure
plt.xlim(x_values.min(), x_values.max())
plt.ylim(y_values.min(), y_values.max())
# specify the ticks on the X and Y axes (one tick per unit across the data range)
plt.xticks((np.arange(int(min(X[:, 0])-1), int(max(X[:, 0])+1), 1.0)))
plt.yticks((np.arange(int(min(X[:, 1])-1), int(max(X[:, 1])+1), 1.0)))
plt.show()
if __name__ == '__main__':
    # Input data: nine 2-D points, three per class.
    X = np.array([[4, 7], [3.5, 8], [3.1, 6.2], [0.5, 1], [1, 2],
                  [1.2, 1.9], [6, 2], [5.7, 1.5], [5.4, 2.2]])
    y = np.array([0, 0, 0, 1, 1, 1, 2, 2, 2])

    # Initialize and train the logistic-regression classifier
    # (C=100 keeps the regularization weak).
    classifier = linear_model.LogisticRegression(solver='liblinear', C=100)
    classifier.fit(X, y)

    # Draw the datapoints and the decision boundaries.
    plot_classifier(classifier, X, y)
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
# Show confusion matrix
def plot_confusion_matrix(confusion_mat):
    """Display a confusion matrix as a grayscale image with labeled axes.

    Parameters
    ----------
    confusion_mat : square 2-D array
        Confusion matrix, e.g. from sklearn.metrics.confusion_matrix.
    """
    plt.imshow(confusion_mat, interpolation='nearest', cmap=plt.cm.gray)
    plt.title('Confusion matrix')
    plt.colorbar()
    # One tick per class, derived from the matrix size instead of
    # hard-coding 4 classes.
    tick_marks = np.arange(confusion_mat.shape[0])
    plt.xticks(tick_marks, tick_marks)
    plt.yticks(tick_marks, tick_marks)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

# Example: 4-class labels with a few misclassifications.
y_true = [1, 0, 0, 2, 1, 0, 3, 3, 3]
y_pred = [1, 1, 0, 2, 1, 0, 1, 3, 3]
confusion_mat = confusion_matrix(y_true, y_pred)
plot_confusion_matrix(confusion_mat)

# Print the per-class precision / recall / F1 classification report.
from sklearn.metrics import classification_report
target_names = ['Class-0', 'Class-1', 'Class-2', 'Class-3']
print(classification_report(y_true, y_pred, target_names=target_names))
# (Removed: trailing CSDN page-feedback widget text — the "was this helpful?"
# rating control scraped along with the article; it is not part of the code.)