This uses the binarized MNIST dataset, with the feature values binarized as well. The example in the book uses a single one-dimensional feature, but in most cases features are not one-dimensional, so each boosting round here searches over all features for the optimal split feature and split point. The weak learner is the simplest possible threshold classifier; every weak classifier stores its own split feature and split point, and at prediction time only that one feature value of the input needs to be consulted.
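For reference, the quantities the code cites by the book's equation numbers are, for round $m$ with weak classifier $G_m$ and weighted error rate $e_m$:

$$\alpha_m = \frac{1}{2}\ln\frac{1-e_m}{e_m} \tag{8.2}$$

$$w_{m+1,i} = \frac{w_{m,i}}{Z_m}\exp\left(-\alpha_m y_i G_m(x_i)\right) \tag{8.4}$$

$$Z_m = \sum_{i=1}^{N} w_{m,i}\exp\left(-\alpha_m y_i G_m(x_i)\right) \tag{8.5}$$

The final strong classifier is $G(x) = \operatorname{sign}\left(\sum_{m=1}^{M}\alpha_m G_m(x)\right)$.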
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import math
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class Sign(object):
    '''
    Threshold classifier
    Two possible directions:
    1) x < v  =>  y = 1
    2) x > v  =>  y = 1
    Because the MNIST features here are already binarized, v only needs to
    take three values: {0, 1, 2}
    '''
def __init__(self, features, labels, w):
        self.X = features         # training set, restricted to a single feature
        self.Y = labels
        self.N = len(labels)
        self.w = w                # weight distribution over the training samples
        self.indexes = [0, 1, 2]  # candidate values for the threshold v
def train_less_than(self):
        '''
        Find the optimal split point v for the direction (x < v => y = 1).
        '''
        index = -1
        error_score = float('inf')
        for i in self.indexes:  # iterate over all candidate split points
            score = 0
            for j in range(self.N):  # iterate over all samples
                val = -1
                if self.X[j] < i:  # this direction predicts 1 when x < v
                    val = 1        # val is the stump's prediction
                if val * self.Y[j] < 0:  # misclassified, so add this sample's weight
                    score += self.w[j]
            if score < error_score:
                index = i
                error_score = score
        return index, error_score
def train_more_than(self):
        '''
        Find the optimal split point v for the direction (x > v => y = 1).
        '''
        index = -1
        error_score = float('inf')
        for i in self.indexes:
            score = 0
            for j in range(self.N):
                val = 1  # this direction predicts 1 when x >= v
                if self.X[j] < i:
                    val = -1
                if val * self.Y[j] < 0:  # misclassified, so add this sample's weight
                    score += self.w[j]
            if score < error_score:
                index = i
                error_score = score
        return index, error_score
def train(self):
less_index, less_score = self.train_less_than()
more_index, more_score = self.train_more_than()
if less_score < more_score:
self.is_less = True
self.index = less_index
return less_score
else:
self.is_less = False
self.index = more_index
return more_score
def predict(self, feature):
if self.is_less:
if feature < self.index:
return 1.0
else:
return -1.0
else:
            if feature >= self.index:  # x >= v predicts 1, consistent with train_more_than
return 1.0
else:
return -1.0
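# A quick usage sketch for Sign on toy data (not part of the pipeline below):
#   stump = Sign([0, 1, 1, 0], [-1, 1, 1, -1], [0.25] * 4)
#   err = stump.train()  # picks the direction/threshold with the lowest weighted
#                        # error; here x >= 1 => y = 1 separates perfectly, err = 0
#   stump.predict(1)     # -> 1.0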
class AdaBoost(object):
def __init__(self):
pass
def init_parameters(self, features, labels):
        self.X = features
        self.Y = labels
        self.n = features.shape[1]        # number of features
        self.N = features.shape[0]        # size of the training set
        self.M = 10                       # number of weak classifiers
        self.w = [1.0 / self.N] * self.N  # weight distribution over the training set
        self.alpha = []                   # weights of the weak classifiers
        self.classifier = []              # (feature index, classifier) pairs, one per round
def _w(self, index, classifier, i):
        '''
        Formula (8.4), without the normalizer Z.
        index is the feature chosen by the current weak classifier and is used
        to compute that classifier's prediction.
        '''
return self.w[i] * math.exp(-self.alpha[-1] * self.Y[i] * classifier.predict(self.X[i][index]))
def Z(self, index, classifier):
        '''
        Formula (8.5): the normalizer Z_m.
        '''
Z = 0
for i in range(self.N):
Z += self._w(index, classifier, i)
return Z
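    # Worked example of the weight update: if e_m = 0.25, then
    # alpha = 0.5 * log(0.75 / 0.25) ~= 0.549 and exp(2 * alpha) = 3, so after
    # normalization each misclassified sample carries 3x the weight of an
    # equally-weighted, correctly classified one in the next round.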
def train(self, features, labels):
self.init_parameters(features, labels)
        for iteration in range(self.M):  # each round picks the best feature and stump
            logging.debug('iteration %d' % iteration)
            # (error rate, feature index, classifier)
            best_classifier = (float('inf'), None, None)
            for i in range(self.n):  # search for the optimal feature
                feature_column = list(map(lambda x: x[i], self.X))  # all values of feature i
                classifier = Sign(feature_column, self.Y, self.w)
                error_score = classifier.train()
                if error_score < best_classifier[0]:
                    best_classifier = (error_score, i, classifier)
            em = best_classifier[0]  # error rate of the best weak classifier
            if em == 0:
                # formula (8.2) diverges as em -> 0, so cap alpha at a large constant
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))  # formula (8.2)
            self.classifier.append(best_classifier[1:])
            Z = self.Z(best_classifier[1], best_classifier[2])
            # recompute the weight distribution over the training set, formula (8.4)
            for i in range(self.N):
                self.w[i] = self._w(best_classifier[1], best_classifier[2], i) / Z
    def _predict(self, feature):
        result = 0.0
        for i in range(self.M):  # iterate over the weak classifiers
            index = self.classifier[i][0]  # feature index used by this classifier
            classifier = self.classifier[i][1]  # the weak classifier itself
            result += self.alpha[i] * classifier.predict(feature[index])
        # final strong classifier: G(x) = sign(sum_m alpha_m * G_m(x))
        if result > 0:
            return 1
        else:
            return -1
def predict(self, features):
results = []
for feature in features:
results.append(self._predict(feature))
return results
# Binarize one image: any nonzero pixel becomes 1
def binaryzation(image):
cv_img = []
for i in image:
if i > 0:
cv_img.append(1)
else:
cv_img.append(0)
return np.array(cv_img)
def binaryzation_features(train_set):
features = []
for img in train_set:
img = binaryzation(img)
features.append(img)
features = np.array(features)
features = features.reshape(-1, 784)
return features
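# A vectorized equivalent of the two functions above, assuming train_set is an
# integer ndarray of shape (n, 784); the explicit loops are kept to mirror the
# step-by-step logic:
def binaryzation_features_vectorized(train_set):
    return (train_set > 0).astype(int)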
if __name__ == '__main__':
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
print('Start reading data:')
time1 = time.time()
raw_data = pd.read_csv('data/train_binary.csv', header=0)
data = raw_data.values
imgs = data[:, 1:]
labels = data[:, 0]
    # binarize the images first
features = binaryzation_features(imgs)
    # split into 1/2 training set, 1/2 test set
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.5, random_state=0)
print(train_features.shape)
time2 = time.time()
print('read data cost %f seconds' % (time2 - time1))
print('Start training:')
    # Map the labels to 1 and -1 by applying the lambda to train_labels.
    # In Python 3, map returns an iterator, hence the outer list().
train_labels = list(map(lambda x: 2 * x - 1, train_labels))
ada = AdaBoost()
ada.train(train_features, train_labels)
time3 = time.time()
print('training cost %f seconds' % (time3 - time2))
print('Start predicting:')
test_predict = ada.predict(test_features)
time4 = time.time()
print('predicting cost %f seconds' % (time4 - time3))
    # the test labels must also be mapped to 1 and -1
test_labels = list(map(lambda x: 2 * x - 1, test_labels))
accuracy = sum([test_labels[i] == test_predict[i] for i in range(len(test_labels))]) / len(test_labels)
print("The accuracy is %f!" % accuracy)
'''
output:
Start reading data:
(21000, 784)
read data cost 17.040476 seconds
Start training:
DEBUG:root:iteration 0
DEBUG:root:iteration 1
...(training takes a long time; the final accuracy reaches 98% or higher)
'''
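As a sanity check, the same experiment can be reproduced with scikit-learn's built-in AdaBoost over depth-1 decision trees. A rough sketch, reusing the arrays prepared above (accuracy and runtime will differ, since sklearn's stumps pick real-valued splits):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=10)
clf.fit(train_features, train_labels)
print('sklearn accuracy: %f' % clf.score(test_features, test_labels))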