参考链接
提升方法理论推导:https://blog.csdn.net/ACM_hades/article/details
数据链接:https://github.com/WenDesi/lihang_book_algorithm/blob/master/data
代码
- 数据集:我们选择MNIST数据集进行实验,它包含各种手写数字(0-9)图片,图片大小28*28。MNIST数据集本身有10个类别,为了将其变成二分类问题我们进行如下处理:label等于0的继续等于0,label大于0改为1。这样就将十分类的数据改为二分类的数据。
- 特征选择:可选择的特征有很多,包括:
- 自己提取特征
- 将整个图片作为特征向量
- HOG特征
- 我们选择将整个图片作为特征(784=28×28)。
- 数据二值化:在进行模型训练前我们对数据进行了二值化,即将原始数据中特征的取值范围从{0,1,2,…,255}变成{0,1}
- 基本分类器:我们使用树桩,即只有根节点的决策树。这里我们遍历所有特征的所有切分点,选择错误率最低的切分点,作为当前的树桩分类器。
代码
# encoding=utf-8
import cv2
import time
import math
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Binarization
def binaryzation(img):
    """Binarize image data in place.

    Maps every pixel from the 0-255 range into {0, 1} with an *inverted*
    threshold at 50 (equivalent to cv2.threshold(..., 50, 1,
    cv2.THRESH_BINARY_INV)): pixels <= 50 become 1, pixels > 50 become 0.

    Args:
        img: 2-D numpy array, one flattened image per row. Mutated in
            place; the array's dtype is preserved.
    """
    # One vectorized pass over the whole array replaces the original
    # per-image Python loop (and the OpenCV call it made per image).
    img[:] = np.where(img <= 50, 1, 0)
class AdaBoost(object):
    """AdaBoost binary classifier built from decision stumps.

    Expects features binarized to {0, 1} and labels in {-1, +1}.
    Trains self.M weak classifiers, each a one-feature stump chosen to
    minimize the weighted training error under the current sample weights.
    """

    def __init__(self, features, labels):
        self.X = features              # training features, shape (N, n), values in {0, 1}
        self.Y = labels                # training labels in {-1, +1}
        self.n = len(features[0])      # feature dimension
        self.N = len(features)         # number of training samples
        self.M = 20                    # number of weak classifiers
        # Sample weight distribution, uniform at the start. Kept as a
        # numpy array (the original used a Python list, which only worked
        # through implicit array coercion in the update step).
        self.w = np.full(self.N, 1.0 / self.N)
        self.alpha = []                # weak-classifier coefficients
        self.classifier = []           # (feature_index, mark) per weak classifier

    def create_basic_classifier(self, feature_index):
        '''Build the best decision stump for one binarized feature.

        Because features take values in {0, 1}, only four stump rules
        exist; each is identified by an integer "mark":
            0: predict +1 when the feature value is 0
            1: predict +1 when the feature value is 1
            2: always predict +1
            3: always predict -1

        Returns:
            (error, mark, predictions): the lowest weighted error, its
            rule mark, and that rule's predictions on the training set
            (the predictions are needed later to update sample weights).
        '''
        features = self.X[:, feature_index]
        error_list = []
        predict_list = []
        # NOTE: np.int / np.float were removed in NumPy >= 1.24; the
        # builtin int / float are the correct replacements here.
        # Rule 0: feature == 0 -> predict +1
        predict_list.append((features == 0).astype(int) * 2 - 1)
        error_list.append(np.sum((predict_list[0] != self.Y).astype(float) * self.w))
        # Rule 1: feature == 1 -> predict +1
        predict_list.append(features.astype(int) * 2 - 1)
        error_list.append(np.sum((predict_list[1] != self.Y).astype(float) * self.w))
        # Rule 2: always +1 (errs exactly on the -1 samples)
        predict_list.append(np.ones(self.N, dtype=int))
        error_list.append(np.sum((self.Y == -1).astype(float) * self.w))
        # Rule 3: always -1 (errs exactly on the +1 samples)
        predict_list.append(np.full(self.N, -1))
        error_list.append(np.sum((self.Y == 1).astype(float) * self.w))
        # First minimum wins, matching the original strict-< scan.
        mark = int(np.argmin(error_list))
        return error_list[mark], mark, predict_list[mark]

    def train(self):
        """Fit self.M weak classifiers with AdaBoost re-weighting."""
        for _ in range(self.M):
            # Scan every feature for the best stump under current weights.
            best_classifier = (100000, None, None, None)
            for i in range(self.n):
                error, mark, predict = self.create_basic_classifier(i)
                if error < best_classifier[0]:
                    best_classifier = (error, i, mark, predict)
            em = best_classifier[0]
            # Coefficient alpha = 0.5 * ln((1 - em) / em); a perfect stump
            # (em == 0) would give +inf, so cap it at 100.
            if em == 0:
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))
            self.classifier.append((best_classifier[1], best_classifier[2]))
            # Re-weight samples: misclassified ones (Y * predict == -1)
            # gain weight, then renormalize to a distribution.
            predict = best_classifier[-1]
            self.w = self.w * np.exp(-1 * self.alpha[-1] * (self.Y * predict))
            self.w = self.w / np.sum(self.w)

    def _predict_(self, feature):
        """Weighted vote of all weak classifiers for one sample; returns +1 or -1."""
        result = 0.0
        for i in range(self.M):
            index, mark = self.classifier[i]
            # Recover the stump's output from its (feature_index, mark) pair.
            if mark == 2 or (mark == 0 and feature[index] == 0) or (mark == 1 and feature[index] == 1):
                predict = 1
            else:
                predict = -1
            result += self.alpha[i] * predict
        return 1 if result > 0 else -1

    def predict(self, features):
        """Predict a label in {-1, +1} for each row of `features`."""
        return [self._predict_(feature) for feature in features]
if __name__ == '__main__':
    print('Start read data')
    start = time.time()
    # Load the binary-label MNIST csv: column 0 is the label, the
    # remaining 784 columns are the flattened 28x28 pixel values.
    data = pd.read_csv('../data/train_binary.csv').values
    print("data shape:", data.shape)
    labels = data[:, 0]
    imgs = data[:, 1:]
    binaryzation(imgs)  # squash pixel values from 0-255 down to {0, 1}
    print("imgs shape:", imgs.shape)
    print("labels shape:", labels.shape)
    # Hold out one third of the data for testing.
    train_features, test_features, train_labels, test_labels = train_test_split(
        imgs, labels, test_size=0.33, random_state=23323)
    print("train data count :%d" % len(train_labels))
    print("test data count :%d" % len(test_labels))
    print('read data cost ', time.time() - start, ' second')

    start = time.time()
    print('Start training')
    # Map labels {0, 1} -> {-1, +1} as AdaBoost expects.
    train_labels = train_labels * 2 - 1
    ada = AdaBoost(train_features, train_labels)
    ada.train()
    print('training cost ', time.time() - start, ' second')

    start = time.time()
    print('Start predicting')
    test_predict = ada.predict(test_features)
    test_labels = test_labels * 2 - 1
    score = accuracy_score(test_labels, test_predict)
    print('predicting cost ', time.time() - start, ' second')
    print("The accruacy socre is ", score)
结果:
Start read data
data shape: (42000, 785)
imgs shape: (42000, 784)
labels shape: (42000,)
train data count :28140
test data count :13860
read data cost 4.342354774475098 second
Start training
training cost 63.095797061920166 second
Start predicting
predicting cost 0.2732689380645752 second
The accruacy socre is 0.9738816738816739