整理了六种监督算法在同一项目上实现的方法。
朴素贝叶斯小项目——识别作者
参见文章 机器学习(1)——贝叶斯网络分类算法 最后一部分
SVM小项目——识别作者
参见文章 机器学习——SVM
决策树小项目——识别作者
参见文章 机器学习——决策树
KNN小项目——识别作者
#!/usr/bin/python
### K-nearest-neighbours on the terrain data: scatter-plot the training points
### by label, fit a default KNeighborsClassifier, print the fit/predict
### wall-clock times and the test accuracy, then call prettyPicture on the
### fitted classifier and the test set.
from time import time

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
### (feature index 0 is used as grade, index 1 as bumpiness; label 0 is drawn
### as "fast", label 1 as "slow")
labelled_points = list(zip(features_train, labels_train))
grade_fast = [point[0] for point, label in labelled_points if label == 0]
bumpy_fast = [point[1] for point, label in labelled_points if label == 0]
grade_slow = [point[0] for point, label in labelled_points if label == 1]
bumpy_slow = [point[1] for point, label in labelled_points if label == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
# Both classes are plotted as (bumpiness, grade) to match the axis labels;
# the slow class was previously passed as (grade, bumpiness), i.e. transposed.
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

clf = KNeighborsClassifier()

# Single-argument print(...) produces the same text under Python 2 and 3.
t0 = time()
clf.fit(features_train, labels_train)
print("training time: %s s" % round(time() - t0, 3))

t1 = time()
pred = clf.predict(features_test)
print("testing time: %s s" % round(time() - t1, 3))

# accuracy_score is documented as (y_true, y_pred); the value is the same
# either way, but the conventional order avoids confusion with other metrics.
acc = accuracy_score(labels_test, pred)
print(acc)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # Deliberately best-effort: the visualisation step is skipped when a
    # NameError is raised (e.g. prettyPicture is not available).
    pass
training time: 0.164 s
testing time: 0.012 s
0.92
Adaboost小项目——识别作者
#!/usr/bin/python
### AdaBoost on the terrain data: scatter-plot the training points by label,
### fit a default AdaBoostClassifier, print the fit/predict wall-clock times
### and the test accuracy, then call prettyPicture on the fitted classifier
### and the test set.
from time import time

import matplotlib.pyplot as plt
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score

from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
### (feature index 0 is used as grade, index 1 as bumpiness; label 0 is drawn
### as "fast", label 1 as "slow")
labelled_points = list(zip(features_train, labels_train))
grade_fast = [point[0] for point, label in labelled_points if label == 0]
bumpy_fast = [point[1] for point, label in labelled_points if label == 0]
grade_slow = [point[0] for point, label in labelled_points if label == 1]
bumpy_slow = [point[1] for point, label in labelled_points if label == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
# Both classes are plotted as (bumpiness, grade) to match the axis labels;
# the slow class was previously passed as (grade, bumpiness), i.e. transposed.
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

clf = AdaBoostClassifier()

# Single-argument print(...) produces the same text under Python 2 and 3.
t0 = time()
clf.fit(features_train, labels_train)
print("adaboost training time: %s s" % round(time() - t0, 3))

t1 = time()
pred = clf.predict(features_test)
print("adaboost testing time: %s s" % round(time() - t1, 3))

# accuracy_score is documented as (y_true, y_pred); the value is the same
# either way, but the conventional order avoids confusion with other metrics.
acc = accuracy_score(labels_test, pred)
# Two spaces after the colon reproduce the original `print "...: ", acc` output.
print("adaboost accuracy:  %s" % acc)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # Deliberately best-effort: the visualisation step is skipped when a
    # NameError is raised (e.g. prettyPicture is not available).
    pass
adaboost training time: 0.196 s
adaboost testing time: 0.019 s
adaboost accuracy: 0.924
RandomForest小项目——识别作者
#!/usr/bin/python
### Random forest on the terrain data: scatter-plot the training points by
### label, fit a RandomForestClassifier with a single estimator, print the
### fit/predict wall-clock times and the test accuracy, then call
### prettyPicture on the fitted classifier and the test set.
import math  # not referenced below in this script; kept in case later code needs it
from time import time

import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData

features_train, labels_train, features_test, labels_test = makeTerrainData()

### the training data (features_train, labels_train) have both "fast" and "slow"
### points mixed together--separate them so we can give them different colors
### in the scatterplot and identify them visually
### (feature index 0 is used as grade, index 1 as bumpiness; label 0 is drawn
### as "fast", label 1 as "slow")
labelled_points = list(zip(features_train, labels_train))
grade_fast = [point[0] for point, label in labelled_points if label == 0]
bumpy_fast = [point[1] for point, label in labelled_points if label == 0]
grade_slow = [point[0] for point, label in labelled_points if label == 1]
bumpy_slow = [point[1] for point, label in labelled_points if label == 1]

#### initial visualization
plt.xlim(0.0, 1.0)
plt.ylim(0.0, 1.0)
plt.scatter(bumpy_fast, grade_fast, color="b", label="fast")
# Both classes are plotted as (bumpiness, grade) to match the axis labels;
# the slow class was previously passed as (grade, bumpiness), i.e. transposed.
plt.scatter(bumpy_slow, grade_slow, color="r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
################################################################################

# Number of features per training row; computed here but not passed to the
# classifier below.
n_features = len(features_train[0])
clf = RandomForestClassifier(n_estimators=1)

# Single-argument print(...) produces the same text under Python 2 and 3.
t0 = time()
clf.fit(features_train, labels_train)
print("RandomForest training time: %s s" % round(time() - t0, 3))

t1 = time()
pred = clf.predict(features_test)
print("RandomForest testing time: %s s" % round(time() - t1, 3))

# accuracy_score is documented as (y_true, y_pred); the value is the same
# either way, but the conventional order avoids confusion with other metrics.
acc = accuracy_score(labels_test, pred)
# Two spaces after the colon reproduce the original `print "...: ", acc` output.
print("RandomForest accuracy:  %s" % acc)

try:
    prettyPicture(clf, features_test, labels_test)
except NameError:
    # Deliberately best-effort: the visualisation step is skipped when a
    # NameError is raised (e.g. prettyPicture is not available).
    pass
RandomForest training time: 0.04 s
RandomForest testing time: 0.009 s
RandomForest accuracy: 0.92