1. Preprocessing
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.datasets import make_blobs
from sklearn.datasets import make_circles
from sklearn.datasets import make_moons
from sklearn.svm import LinearSVC
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
# Generate a small labeled dataset and print each sample with its label
X, y = make_classification(n_samples=20, n_features=5, n_classes=2)
for x_, y_ in zip(X, y):
    print(y_, end=': ')
    print(x_)
# Toy datasets for visualization: Gaussian blobs, concentric circles, interleaved moons
X, y = make_blobs(n_samples=100, n_features=2, centers=5)
plt.scatter(X[:, 0], X[:, 1], c=y)
plt.show()
X, y = make_circles(n_samples=1000, factor=0.5, noise=0.1)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
X, y = make_moons(n_samples=1000, noise=0.1)
plt.scatter(X[:, 0], X[:, 1], marker='o', c=y)
plt.show()
# Regenerate a multi-feature dataset so feature selection is meaningful
X, y = make_classification(n_samples=200, n_features=5, n_classes=2)
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))
X_new = sel.fit_transform(X)
# chi2 requires non-negative features, so rescale to [0, 1] first
X_pos = preprocessing.MinMaxScaler().fit_transform(X)
sel = SelectKBest(chi2, k=2)
X_new = sel.fit_transform(X_pos, y)
# L1-penalized linear SVM zeroes out weak coefficients; SelectFromModel keeps the rest
LSVC = LinearSVC(C=0.01, penalty="l1", dual=False, max_iter=5000)
LSVC.fit(X, y)
model = SelectFromModel(LSVC, prefit=True)
X_new = model.transform(X)
# Tree ensembles expose feature_importances_, which SelectFromModel thresholds
ET = ExtraTreesClassifier()
ET = ET.fit(X, y)
model = SelectFromModel(ET, prefit=True)
X_new = model.transform(X)
# Standardize to zero mean, unit variance (function form and estimator form)
X_scaled = preprocessing.scale(X)
scaler = preprocessing.StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
# Rescale each feature into [0, 1]
scaler = preprocessing.MinMaxScaler(feature_range=(0, 1))
scaler.fit(X)
X_scaled = scaler.transform(X)
# Scale each sample to unit L2 norm
X_normalized = preprocessing.normalize(X, norm='l2')
normalizer = preprocessing.Normalizer()
normalizer.fit(X)
X_normalized = normalizer.transform(X)
# Values above the threshold map to 1, the rest to 0
binarizer = preprocessing.Binarizer(threshold=4)
binarizer.fit(X)
X_binarized = binarizer.transform(X)
# One-hot encoding expects categorical inputs; a small categorical example
X_cat = [['red'], ['green'], ['blue'], ['green']]
encoder = preprocessing.OneHotEncoder()
encoder.fit(X_cat)
X_encoded = encoder.transform(X_cat).toarray()
# Dimensionality reduction on the digits dataset
X, y = load_digits(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y)
# PCA is unsupervised; n_components=0.95 keeps 95% of the variance
pca = PCA(n_components=0.95)
pca.fit(X_train)
X_train_reduced = pca.transform(X_train)
X_test_reduced = pca.transform(X_test)
# LDA is supervised and uses the labels to find discriminative directions
LDA = LinearDiscriminantAnalysis(n_components=2)
LDA.fit(X_train, y_train)
X_train_reduced = LDA.transform(X_train)
X_test_reduced = LDA.transform(X_test)
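To sanity-check the reduction, the fitted PCA reports how many components it kept and how much variance they explain; a small inspection sketch (added here, not part of the original notes):
# Sketch: inspect the fitted PCA
print("Components kept:", pca.n_components_)
print("Variance explained = %.4s" % pca.explained_variance_ratio_.sum())
print("PCA-reduced shape:", pca.transform(X_train).shape)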
2. Clustering
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN
from sklearn.cluster import AgglomerativeClustering
from sklearn.mixture import GaussianMixture
from sklearn.metrics import silhouette_score
iris = datasets.load_iris()
iris_X = iris.data
# K-Means; the silhouette coefficient below scores cluster cohesion vs. separation
KM_model = KMeans(n_clusters=3)
KM_model.fit(iris_X)
labels = KM_model.labels_
print("K-Means SC = %.4s" % silhouette_score(iris_X, labels, metric='euclidean'))
# DBSCAN labels outliers as -1; parameters not listed keep their defaults
DB_model = DBSCAN(eps=0.5, min_samples=5, metric='euclidean')
DB_model.fit(iris_X)
labels = DB_model.labels_
print("DBSCAN SC = %.4s" % silhouette_score(iris_X, labels, metric='euclidean'))
# Agglomerative (bottom-up hierarchical) clustering
HC_model = AgglomerativeClustering(n_clusters=3)
HC_model.fit(iris_X)
labels = HC_model.labels_
print("Hierarchical Clustering SC = %.4s" % silhouette_score(iris_X, labels, metric='euclidean'))
# GaussianMixture has no labels_ attribute; assignments come from predict()
GMM_model = GaussianMixture(n_components=3)
GMM_model.fit(iris_X)
labels = GMM_model.predict(iris_X)
print("GMM SC = %.4s" % silhouette_score(iris_X, labels, metric='euclidean'))
3. Classification
Models
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn import naive_bayes
from sklearn import tree
from sklearn.neural_network import MLPClassifier
digits = datasets.load_digits()
digits_X = digits.data
digits_y = digits.target
X_train, X_test, y_train, y_test = train_test_split(digits_X, digits_y, test_size=0.3, random_state=0)
# Note: LinearRegression is a regressor, so score() returns R^2, not accuracy
LinearRegression_model = LinearRegression()
LinearRegression_model.fit(X_train, y_train)
print("LinearRegression R^2 = %.4s" % LinearRegression_model.score(X_test, y_test))
LR_model = LogisticRegression(max_iter=5000)
LR_model.fit(X_train, y_train)
print("LR acc = %.4s" % LR_model.score(X_test, y_test))
# Hinge loss with an L2 penalty trains a linear SVM by stochastic gradient descent
SGD_model = SGDClassifier(loss="hinge", penalty="l2")
SGD_model.fit(X_train, y_train)
print("SGD acc = %.4s" % SGD_model.score(X_test, y_test))
SVM_model = SVC(C=1.0, kernel='rbf', gamma='auto', decision_function_shape='ovo')
SVM_model.fit(X_train, y_train)
print("SVM acc = %.4s" % SVM_model.score(X_test, y_test))
kNN_model = KNeighborsClassifier(n_neighbors=5)
kNN_model.fit(X_train, y_train)
print("kNN acc = %.4s" % kNN_model.score(X_test, y_test))
NB_model = naive_bayes.MultinomialNB(alpha=1.0, fit_prior=True, class_prior=None)
NB_model.fit(X_train, y_train)
print("NB acc = %.4s" % NB_model.score(X_test, y_test))
DT_model = tree.DecisionTreeClassifier()
DT_model.fit(X_train, y_train)
print("DT acc = %.4s" % DT_model.score(X_test, y_test))
MLP_model = MLPClassifier(activation='relu', solver='adam', alpha=1e-5, hidden_layer_sizes=(5, 2), max_iter=5000)
MLP_model.fit(X_train, y_train)
print("MLP acc = %.4s" % MLP_model.score(X_test, y_test))
OVO & OVR
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.multiclass import OneVsOneClassifier
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
log_reg = LogisticRegression(max_iter=5000)
# An explicitly multinomial (softmax) variant for comparison
log_reg1 = LogisticRegression(multi_class="multinomial", solver="newton-cg", max_iter=5000)
ovr_model = OneVsRestClassifier(log_reg)
ovr_model.fit(X_train, y_train)
print("OVR acc = %.4s" % ovr_model.score(X_test, y_test))
ovo_model = OneVsOneClassifier(log_reg1)
ovo_model.fit(X_train, y_train)
print("OVO acc = %.4s" % ovo_model.score(X_test, y_test))
Ensemble
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
digits = datasets.load_digits()
digits_X = digits.data
digits_y = digits.target
X_train, X_test, y_train, y_test = train_test_split(digits_X, digits_y, test_size=0.3, random_state=0)
# Bag 10 kNN classifiers, each trained on half the samples and half the features
bagging = BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
bagging.fit(X_train, y_train)
print("Bagging acc = %.4s" % bagging.score(X_test, y_test))
RF = RandomForestClassifier(n_estimators=10)
RF.fit(X_train, y_train)
print("RF acc = %.4s" % RF.score(X_test, y_test))
ET = ExtraTreesClassifier(n_estimators=10, max_depth=None, min_samples_split=2)
ET.fit(X_train, y_train)
print("ET acc = %.4s" % ET.score(X_test, y_test))
AdaBoost = AdaBoostClassifier(n_estimators=1000)
AdaBoost.fit(X_train, y_train)
print("AdaBoost acc = %.4s" % AdaBoost.score(X_test, y_test))
GBDT = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1)
GBDT.fit(X_train, y_train)
print("GBDT acc = %.4s" % GBDT.score(X_test, y_test))
HardVoting = VotingClassifier(estimators=[('bg', bagging), ('rf', RF), ('et', ET)], voting='hard')
for clf, label in zip([bagging, RF, ET, HardVoting], ['Bagging', 'Random Forest', 'Extra Trees', 'Ensemble']):
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
SoftVoting = VotingClassifier(estimators=[('bg', bagging), ('rf', RF), ('et', ET)], voting='soft', weights=[2, 1, 2])
for clf, label in zip([bagging, RF, ET, SoftVoting], ['Bagging', 'Random Forest', 'Extra Trees', 'Ensemble']):
    scores = cross_val_score(clf, X_test, y_test, cv=5, scoring='accuracy')
    print("Accuracy: %0.2f (+/- %0.2f) [%s]" % (scores.mean(), scores.std(), label))
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3, random_state=0)
clf1 = LogisticRegression(random_state=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
eclf = VotingClassifier(estimators=[('lr', clf1), ('rf', clf2), ('gnb', clf3)], voting='soft')
params = {'lr__C': [1.0, 100.0], 'rf__n_estimators': [20, 200]}
model = GridSearchCV(eclf, param_grid=params, cv=5)
model = model.fit(X_train, y_train)
print("Best Model: %s" % model.best_estimator_)
print("Best Score: %.4s" % model.best_score_)
print("Best Parameters: %s" % model.best_params_)
XGBoost
from sklearn import datasets
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
iris = datasets.load_iris()
iris_X = iris.data
iris_y = iris.target
X_train, X_test, y_train, y_test = train_test_split(iris_X, iris_y, test_size=0.3)
model = XGBClassifier()
eval_set = [(X_test, y_test)]
# Older xgboost API: releases >= 2.0 move early_stopping_rounds and eval_metric to the constructor (see below)
model.fit(X_train, y_train, early_stopping_rounds=10, eval_metric="mlogloss", eval_set=eval_set, verbose=True)
print(model.score(X_test, y_test))
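With xgboost >= 2.0 the same early stopping is configured on the estimator itself; a version-dependent sketch (adjust to the installed release):
# Sketch for newer xgboost: early stopping set in the constructor
model = XGBClassifier(early_stopping_rounds=10, eval_metric="mlogloss")
model.fit(X_train, y_train, eval_set=eval_set, verbose=True)
print(model.score(X_test, y_test))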
4. Performance
Accuracy
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
digits = datasets.load_digits()
digits_X = digits.data
digits_y = digits.target
X_train, X_test, y_train, y_test = train_test_split(digits_X, digits_y, test_size=0.3, random_state=0)
LR_model = LogisticRegression(max_iter=5000)
LR_model.fit(X_train, y_train)
print("LR acc = %.4s" % LR_model.score(X_test, y_test))
y_pred = LR_model.predict(X_test)
# predict() already returns class labels, so no rounding is needed
acc = accuracy_score(y_test, y_pred)
print("LR acc = %.4s" % acc)
Confusion Matrix
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import numpy as np
import matplotlib.pyplot as plt
def plot_confusion_matrix(cm, labels_name, title):
    # Normalize each row (true class) to proportions before plotting
    cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    plt.imshow(cm, interpolation='nearest')
    plt.title(title)
    plt.colorbar()
    num_local = np.array(range(len(labels_name)))
    plt.xticks(num_local, labels_name, rotation=90)
    plt.yticks(num_local, labels_name)
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
X, y = make_classification(n_samples=500, n_features=10, n_informative=3, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
LR_model = LogisticRegression(max_iter=5000)
LR_model.fit(X_train, y_train)
y_predict = LR_model.predict(X_test)
cm = confusion_matrix(y_test, y_predict)
print(cm)
plot_confusion_matrix(cm, ['0', '1'], "Confusion Matrix")
plt.show()
# sklearn's convention: rows are true labels, columns are predictions,
# so for classes [0, 1] the matrix is [[TN, FP], [FN, TP]]
TN, FP, FN, TP = cm.ravel()
Accuracy = (TP + TN) / (TP + FP + FN + TN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)
Specificity = TN / (TN + FP)
F1_score = 2 * Precision * Recall / (Precision + Recall)
G_mean = (Recall * Specificity) ** 0.5
print("Accuracy = %.4s" % Accuracy)
print("Precision = %.4s" % Precision)
print("Recall = %s.4" % Recall)
print("Specificity = %.4s" % Specificity)
print("F1_score = %.4s" % F1_score)
print("G_mean = %.4s" % G_mean)
ROC
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
import matplotlib.pyplot as plt
X, y = make_classification(n_samples=500, n_features=10, n_informative=3, n_classes=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
LR_model = LogisticRegression(max_iter=5000)
LR_model.fit(X_train, y_train)
y_predict = LR_model.predict(X_test)
y_score = LR_model.decision_function(X_test)
auc_score = roc_auc_score(y_test, y_score)
print("AUC = %.4s" % auc_score)
fpr, tpr, thresholds = roc_curve(y_test, y_score, pos_label=1)
auc_score_2 = auc(fpr, tpr)
print("AUC = %.4s" % auc_score_2)
plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % auc_score)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic example')
plt.legend(loc="lower right")
plt.show()
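In scikit-learn 1.0 and later, RocCurveDisplay wraps this entire plot in one helper; a version-dependent sketch (added here):
# Sketch: one-call ROC plot (sklearn >= 1.0)
from sklearn.metrics import RocCurveDisplay
RocCurveDisplay.from_estimator(LR_model, X_test, y_test)
plt.show()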
Cross Validation
from sklearn.datasets import load_iris
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.model_selection import validation_curve
iris = load_iris()
X = iris.data
y = iris.target
knn = KNeighborsClassifier(n_neighbors=5)
scores = cross_val_score(knn, X, y, scoring='accuracy', cv=5)
print("K-fold acc = %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
knn = KNeighborsClassifier()
# Search over the full range of k (the bare list [1, 31] would try only those two values)
params = {'n_neighbors': list(range(1, 31))}
model = GridSearchCV(knn, param_grid=params, cv=5)
model = model.fit(X, y)
print("Best Model: %s" % model.best_estimator_)
print("Best Score: %.4s" % model.best_score_)
print("Best Parameters: %s" % model.best_params_)
k_range = range(1, 31)
k_scores = []
for k in k_range:
    knn = KNeighborsClassifier(n_neighbors=k)
    scores = cross_val_score(knn, X, y, cv=5, scoring='accuracy')
    k_scores.append(scores.mean())
plt.plot(k_range, k_scores)
plt.xlabel("Value of K for kNN")
plt.ylabel("Cross-validated Accuracy")
plt.show()
print("Grid Search acc = %.4s" % max(k_scores))
knn = KNeighborsClassifier(n_neighbors=3)
train_size, train_score, test_score = learning_curve(knn, X, y, cv=5, scoring='accuracy',
train_sizes=[0.1, 0.25, 0.5, 0.75, 1])
train_score_mean = train_score.mean(axis=1)
test_score_mean = test_score.mean(axis=1)
plt.plot(train_size, train_score_mean, 'ro-', label="Training")
plt.plot(train_size, test_score_mean, 'gs-', label="Cross-validation")
plt.xlabel("Training examples")
plt.ylabel("Cross-validated Accuracy")
plt.legend(loc="best")
plt.show()
# Sweep k over a range; a two-element list would evaluate only k=1 and k=33
param_range = list(range(1, 33))
knn = KNeighborsClassifier()
train_score, test_score = validation_curve(knn, X, y, cv=5, scoring='accuracy',
param_name='n_neighbors', param_range=param_range)
train_score_mean = train_score.mean(axis=1)
test_score_mean = test_score.mean(axis=1)
plt.plot(param_range, train_score_mean, 'ro-', label="Training")
plt.plot(param_range, test_score_mean, 'gs-', label="Cross-validation")
plt.xlabel("Value of K for kNN")
plt.ylabel("Cross-validated Accuracy")
plt.legend(loc="best")
plt.show()
Timing
import time
start = time.time()
time.sleep(2)  # stand-in for the code being timed
end = time.time()
time_consumed = end - start
print('Running time: %.5s Seconds' % time_consumed)
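For short code paths, time.perf_counter() offers a higher-resolution clock than time.time(); a small alternative sketch (added here):
# Sketch: higher-resolution timing
start = time.perf_counter()
time.sleep(2)
print('Running time: %.5s Seconds' % (time.perf_counter() - start))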
5. Model Saving
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
import joblib
digits = datasets.load_digits()
digits_X = digits.data
digits_y = digits.target
X_train, X_test, y_train, y_test = train_test_split(digits_X, digits_y, test_size=0.3, random_state=0)
kNN_model = KNeighborsClassifier(n_neighbors=5)
kNN_model.fit(X_train, y_train)
# joblib serializes numpy-heavy models efficiently; the file extension is arbitrary
joblib.dump(kNN_model, 'kNN.pickle')
model = joblib.load('kNN.pickle')
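A quick round-trip check (added here, not in the original notes) confirms the reloaded model behaves like the one that was saved:
# Sketch: the reloaded model should score identically to kNN_model
print("Loaded kNN acc = %.4s" % model.score(X_test, y_test))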