手写数字识别(optdigits数据集):
数据文件的每一行代表一幅手写数字图像:前64列是8×8图像的像素值(取值范围为0~16),最后一列是该图像的标签值。
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import os
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from time import time
def show_acc(a, b, tip):
    """Print the percentage of positions at which ``a`` and ``b`` agree.

    Args:
        a: Array-like of labels (ground truth or predictions).
        b: Array-like of labels compared element-wise against ``a``.
        tip: Text printed in front of the score to identify the data set.
    """
    # np.asarray lets plain lists/tuples be passed, not only ndarrays.
    matches = np.asarray(a).ravel() == np.asarray(b).ravel()
    print('%s acc :%.2f%%' % (tip, 100 * np.mean(matches)))
def save_image(image, i):
    """Save one optdigits image as ``<i>.jpg`` in the ``handwriting`` folder.

    The image is stretched to the 0..255 grey range and inverted (dark
    strokes on a light background) so it is easier to inspect by eye.

    Args:
        image: 2-D array of pixel intensities in the optdigits range 0..16.
        i: Integer used as the output file name.
    """
    # optdigits pixels top out at 16, so 255/16 maps them onto 0..255.
    # (The previous factor 16.9 pushed 16 to 270.4, which went negative
    # after inversion and wrapped around in the uint8 cast.)
    # Working on a float copy also leaves the caller's array untouched.
    scaled = np.asarray(image, dtype=float) * (255.0 / 16)
    # Invert for readability; clip guards against out-of-range inputs.
    inverted = np.clip(255 - scaled, 0, 255)
    # PIL expects uint8 data for an 8-bit greyscale image.
    pixels = inverted.astype(np.uint8)
    output_path = './/handwriting'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_path, exist_ok=True)
    Image.fromarray(pixels).save(os.path.join(output_path, '%d.jpg' % i))
if __name__ == '__main__':
    # Load the training set. The builtin float/int replace the np.float /
    # np.int aliases, which no longer exist in current NumPy releases.
    data = np.loadtxt('optdigits.tra', dtype=float, delimiter=',')
    # The last column is the label of the handwritten digit.
    x, y = np.split(data, (-1,), axis=1)
    # Restore every 64-value row to its 8x8 image.
    images = x.reshape(-1, 8, 8)
    y = y.ravel().astype(int)

    # Load the test set the same way.
    data_test = np.loadtxt('optdigits.tes', dtype=float, delimiter=',')
    x_test, y_test = np.split(data_test, (-1,), axis=1)
    images_test = x_test.reshape(-1, 8, 8)
    y_test = y_test.ravel().astype(int)

    # Preview: the first 16 training images on top, 16 test images below.
    plt.figure(figsize=(15, 15), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('trian image:%i' % y[index])
    for index, image in enumerate(images_test[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        # Test images must be titled with the test labels, not the train ones.
        plt.title('test image:%i' % y_test[index])
    # pad has to be passed by keyword in current Matplotlib.
    plt.tight_layout(pad=1.5)
    plt.show()

    # Hyper-parameter grid; only needed by the GridSearchCV variant of the model.
    params = {'C': np.logspace(0, 3, 7), 'gamma': np.logspace(-5, 0, 11)}
    model = svm.SVC(C=10, kernel='rbf', gamma=0.001)
    print('==============start training=================')
    start = time()
    model.fit(x, y)
    end = time()
    train_time = end - start
    print('train time:%dseconds' % train_time)

    y_hat = model.predict(x)
    show_acc(y, y_hat, 'trian data')
    y_hat_test = model.predict(x_test)
    # Show the test predictions next to the test labels (the train
    # predictions were printed here before, which made the pair meaningless).
    print('y_hat:\n', y_hat_test)
    print('y_test:\n', y_test)
    show_acc(y_test, y_hat_test, 'valiation data')

    # Collect the misclassified test samples; the mask is computed once.
    wrong = y_test != y_hat_test
    err_images = images_test[wrong]   # images that were misclassified
    err_y_hat = y_hat_test[wrong]     # what the model predicted for them
    err_y = y_test[wrong]             # what they really are
    print('err_y_hat:\n', err_y_hat)
    print('err_y:\n', err_y)

    # Plot at most 30 misclassified images in a 5x6 grid.
    plt.figure(figsize=(15, 15), facecolor='w')
    for index, image in enumerate(err_images[:30]):
        plt.subplot(5, 6, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('error:%i, the real:%i' % (err_y_hat[index], err_y[index]))
    plt.tight_layout(pad=4)
    plt.show()
接着我们更换训练方法,改用GridSearchCV做网格搜索调参,把程序中定义模型的那一行修改为:
# Replaces the fixed-hyperparameter model of the script above:
# model = svm.SVC(C=10, kernel='rbf', gamma=0.001)
# Search the C/gamma grid stored in `params` with 3-fold cross-validation.
model = GridSearchCV(svm.SVC(kernel='rbf'), param_grid=params, cv=3)
训练时间要长很多,但准确率并没有提升……
接着我们使用经典的MNIST数据集来做实验:
import numpy as np
from sklearn import svm
import matplotlib.colors
import matplotlib.pyplot as plt
from PIL import Image
from sklearn.metrics import accuracy_score
import pandas as pd
import os
import csv
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from time import time
from pprint import pprint
import warnings
def show_acc(a, b, tip):
    """Print how often ``a`` and ``b`` agree, as a percentage.

    Args:
        a: Array-like of labels (ground truth or predictions).
        b: Array-like of labels compared element-wise against ``a``.
        tip: Text printed in front of the score to identify the data set.
    """
    # Coerce first so that lists and tuples work as well as ndarrays.
    left = np.asarray(a).ravel()
    right = np.asarray(b).ravel()
    print('%s acc :%.2f%%' % (tip, 100 * np.mean(left == right)))
def save_image(image, i):
    """Save an inverted copy of ``image`` as ``<i>.jpg`` in ``handwriting``.

    Args:
        image: 2-D array of grey values; the inversion below assumes the
            0..255 range.
        i: Integer used as the output file name.
    """
    # Invert so the strokes are dark on a light background (easier to view).
    inverted = 255 - image
    # PIL expects uint8 data for an 8-bit greyscale image.
    pixels = inverted.astype(np.uint8)
    output_path = './/handwriting'
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_path, exist_ok=True)
    Image.fromarray(pixels).save(os.path.join(output_path, '%d.jpg' % i))
def save_model(model, test_data=None):
    """Write the model's test-set predictions to ``Prediction.csv``.

    Despite its name this function does not persist the model itself; it
    stores one ``ImageId,Label`` row per predicted sample.

    Args:
        model: Fitted estimator exposing ``predict``.
        test_data: Samples to predict. When omitted, the module-level
            ``data_test`` is used, which is what existing callers rely on.
    """
    if test_data is None:
        # Backward-compatible default: the test set loaded by the main script.
        test_data = data_test
    predictions = model.predict(test_data)
    # newline='' is required by the csv module; without it every row is
    # followed by an empty line on Windows.
    with open('Prediction.csv', 'wt', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['ImageId', 'Label'])
        # NOTE(review): ids start at 0 as before; if the consumer of this
        # file expects 1-based image ids, switch to enumerate(..., start=1).
        writer.writerows(enumerate(predictions))
if __name__ == '__main__':
    warnings.filterwarnings('ignore')
    # Which classifier to train: 'SVM' or 'RF'.
    classifier_type = 'RF'

    print('loading train data......')
    # The builtin int replaces np.int, which no longer exists in current NumPy.
    data = pd.read_csv('MNIST.train.csv', header=0, dtype=int)
    print('loading finishing......')
    # Read the labels; every column after the first one is treated as a pixel.
    y = data['label'].values
    x = data.values[:, 1:]
    print('the images numbers:%d, the pixs of images:%d' % (x.shape))
    # Reshape to 28x28 to restore the original image layout.
    images = x.reshape(-1, 28, 28)
    print(images)
    print(y)

    print('loading test data......')
    data_test = pd.read_csv('MNIST.test.csv', header=0, dtype=int)
    data_test = data_test.values
    images_test_result = data_test.reshape(-1, 28, 28)
    print('data-test:\t', data_test)
    print('images-test-result:\t', images_test_result)
    print('loading finishing......')

    # Hold out 20% of the labelled data for validation.
    np.random.seed(0)
    x, x_test, y, y_test = train_test_split(x, y, train_size=0.8, random_state=1)
    images = x.reshape(-1, 28, 28)
    images_test = x_test.reshape(-1, 28, 28)
    print('x-shape:\t', x.shape)
    print('x-test-shape:\t', x_test.shape)

    # Show part of the training data (top) and of the unlabelled test data (bottom).
    plt.figure(figsize=(15, 9), facecolor='w')
    for index, image in enumerate(images[:16]):
        plt.subplot(4, 8, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('train data:%d' % (y[index]))
    for index, image in enumerate(images_test_result[:16]):
        plt.subplot(4, 8, index + 17)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        save_image(image.copy(), index)
        plt.title('test data')
    plt.tight_layout()
    plt.show()

    if classifier_type == 'SVM':
        model = svm.SVC(C=3000, kernel='rbf', gamma=1e-10)
        print('让我们荡起小浆,开始训练吧.............')
        t_start = time()
        model.fit(x, y)
        t_end = time()
        print('train time:%.3f' % (t_end - t_start))
        print('小船到岸,清下水......')
        t = time()
        y_hat = model.predict(x)
        t = time() - t
        # accuracy_score returns a fraction, so scale it to match the '%' sign.
        print('SVM训练集准确率:%.3f%%, 耗时:%.3f' % (100 * accuracy_score(y, y_hat), t))
        t = time()
        y_hat_test = model.predict(x_test)
        t = time() - t
        print('SVM测试集准确率:%.3f%%, 耗时:%.3f' % (100 * accuracy_score(y_test, y_hat_test), t))
        save_model(model)
    elif classifier_type == 'RF':
        # min_impurity_split is no longer accepted by scikit-learn; its old
        # value (1e-10) was effectively "always split", which is the default
        # behaviour without it.
        rfc = RandomForestClassifier(n_estimators=100, criterion='gini', min_samples_split=2,
                                     bootstrap=True, oob_score=True)
        print('让我们再次荡起小浆,开始训练吧.............')
        t = time()
        rfc.fit(x, y)
        print('train time:%.3f' % (time() - t))
        print('OOB准确率:%.3f%%' % (rfc.oob_score_ * 100))
        print('小船到岸,清下水......')
        t = time()
        y_hat = rfc.predict(x)
        t = time() - t
        # These lines used to be labelled "SVM" although they report the forest.
        print('RF训练集准确率:%.3f%%, 耗时:%.3f' % (100 * accuracy_score(y, y_hat), t))
        t = time()
        y_hat_test = rfc.predict(x_test)
        t = time() - t
        print('RF测试集准确率:%.3f%%, 耗时:%.3f' % (100 * accuracy_score(y_test, y_hat_test), t))
        save_model(rfc)
    else:
        # Fail early instead of hitting a NameError on y_hat_test below.
        raise ValueError('unknown classifier_type: %r' % classifier_type)

    # Collect the misclassified validation samples.
    err = (y_test != y_hat_test)
    err_images = images_test[err]
    err_y_hat = y_hat_test[err]
    err_y = y_test[err]
    print('err_y_hat:\n', err_y_hat)
    print('err_y:\n', err_y)

    # Plot at most 20 of them in a 4x5 grid.
    plt.figure(figsize=(15, 15), facecolor='w')
    for index, image in enumerate(err_images[:20]):
        plt.subplot(4, 5, index + 1)
        plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
        plt.title('err:%i, real:%i' % (err_y_hat[index], err_y[index]))
    plt.suptitle('Digital Handwriting recognition:Classifier--%s' % classifier_type, fontsize=15)
    # Leave room at the top of the figure for the suptitle.
    plt.tight_layout(rect=(0, 0, 1, 0.94))
    plt.show()
相对来说,SVM和随机森林的效果都已经不错,但随机森林表现得要好一点;分析原因,可能是SVM的参数还需要进一步调优。