This uses the binarized MNIST dataset, with the feature values binarized as well. The example in the book uses a single one-dimensional feature, but in most cases features are not one-dimensional, so each boosting round here searches over all features for the optimal split feature and split point. The weak learner is the simplest possible threshold classifier; every weak classifier stores its own split feature and split point, and at prediction time only that one feature value of the input needs to be consulted.
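For reference, the quantities the code cites by the book's equation numbers are, for round $m$ with weak classifier $G_m$ and weighted error rate $e_m$:

$$\alpha_m = \frac{1}{2}\ln\frac{1-e_m}{e_m} \tag{8.2}$$

$$w_{m+1,i} = \frac{w_{m,i}}{Z_m}\exp\left(-\alpha_m y_i G_m(x_i)\right) \tag{8.4}$$

$$Z_m = \sum_{i=1}^{N} w_{m,i}\exp\left(-\alpha_m y_i G_m(x_i)\right) \tag{8.5}$$

The final strong classifier is $G(x) = \operatorname{sign}\left(\sum_{m=1}^{M}\alpha_m G_m(x)\right)$.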
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import time
import math
import logging
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
class Sign(object):
    '''
    Threshold classifier
    Two possible directions:
    1) x < v  =>  y = 1
    2) x > v  =>  y = 1
    Because the MNIST features here are already binarized, v only needs to
    take three values: {0, 1, 2}
    '''
def __init__(self, features, labels, w):
        self.X = features         # training set, restricted to a single feature
        self.Y = labels
        self.N = len(labels)
        self.w = w                # weight distribution over the training samples
        self.indexes = [0, 1, 2]  # candidate values for the threshold v
def train_less_than(self):
        '''
        Find the optimal split point v for the direction (x < v => y = 1).
        '''
        index = -1
        error_score = float('inf')
        for i in self.indexes:  # iterate over all candidate split points
            score = 0
            for j in range(self.N):  # iterate over all samples
                val = -1
                if self.X[j] < i:  # this direction predicts 1 when x < v
                    val = 1        # val is the stump's prediction
                if val * self.Y[j] < 0:  # misclassified, so add this sample's weight
                    score += self.w[j]
            if score < error_score:
                index = i
                error_score = score
        return index, error_score
def train_more_than(self):
        '''
        Find the optimal split point v for the direction (x > v => y = 1).
        '''
        index = -1
        error_score = float('inf')
        for i in self.indexes:
            score = 0
            for j in range(self.N):
                val = 1  # this direction predicts 1 when x >= v
                if self.X[j] < i:
                    val = -1
                if val * self.Y[j] < 0:  # misclassified, so add this sample's weight
                    score += self.w[j]
            if score < error_score:
                index = i
                error_score = score
        return index, error_score
def train(self):
less_index, less_score = self.train_less_than()
more_index, more_score = self.train_more_than()
if less_score < more_score:
self.is_less = True
self.index = less_index
return less_score
else:
self.is_less = False
self.index = more_index
return more_score
def predict(self, feature):
if self.is_less:
if feature < self.index:
return 1.0
else:
return -1.0
else:
            if feature >= self.index:  # x >= v predicts 1, consistent with train_more_than
return 1.0
else:
return -1.0
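# A quick usage sketch for Sign on toy data (not part of the pipeline below):
#   stump = Sign([0, 1, 1, 0], [-1, 1, 1, -1], [0.25] * 4)
#   err = stump.train()  # picks the direction/threshold with the lowest weighted
#                        # error; here x >= 1 => y = 1 separates perfectly, err = 0
#   stump.predict(1)     # -> 1.0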
class AdaBoost(object):
def __init__(self):
pass
def init_parameters(self, features, labels):
        self.X = features
        self.Y = labels
        self.n = features.shape[1]        # number of features
        self.N = features.shape[0]        # size of the training set
        self.M = 10                       # number of weak classifiers
        self.w = [1.0 / self.N] * self.N  # weight distribution over the training set
        self.alpha = []                   # weights of the weak classifiers
        self.classifier = []              # (feature index, classifier) pairs, one per round
def _w(self, index, classifier, i):
        '''
        Formula (8.4), without the normalizer Z.
        index is the feature chosen by the current weak classifier and is used
        to compute that classifier's prediction.
        '''
return self.w[i] * math.exp(-self.alpha[-1] * self.Y[i] * classifier.predict(self.X[i][index]))
def Z(self, index, classifier):
        '''
        Formula (8.5): the normalizer Z_m.
        '''
Z = 0
for i in range(self.N):
Z += self._w(index, classifier, i)
return Z
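    # Worked example of the weight update: if e_m = 0.25, then
    # alpha = 0.5 * log(0.75 / 0.25) ~= 0.549 and exp(2 * alpha) = 3, so after
    # normalization each misclassified sample carries 3x the weight of an
    # equally-weighted, correctly classified one in the next round.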
def train(self, features, labels):
self.init_parameters(features, labels)
        for iteration in range(self.M):  # each round picks the best feature and stump
            logging.debug('iteration %d' % iteration)
            # (error rate, feature index, classifier)
            best_classifier = (float('inf'), None, None)
            for i in range(self.n):  # search for the optimal feature
                feature_column = list(map(lambda x: x[i], self.X))  # all values of feature i
                classifier = Sign(feature_column, self.Y, self.w)
                error_score = classifier.train()
                if error_score < best_classifier[0]:
                    best_classifier = (error_score, i, classifier)
            em = best_classifier[0]  # error rate of the best weak classifier
            if em == 0:
                # formula (8.2) diverges as em -> 0, so cap alpha at a large constant
                self.alpha.append(100)
            else:
                self.alpha.append(0.5 * math.log((1 - em) / em))  # formula (8.2)
            self.classifier.append(best_classifier[1:])
            Z = self.Z(best_classifier[1], best_classifier[2])
            # recompute the weight distribution over the training set, formula (8.4)
            for i in range(self.N):
                self.w[i] = self._w(best_classifier[1], best_classifier[2], i) / Z
    def _predict(self, feature):
        result = 0.0
        for i in range(self.M):  # iterate over the weak classifiers
            index = self.classifier[i][0]  # feature index used by this classifier
            classifier = self.classifier[i][1]  # the weak classifier itself
            result += self.alpha[i] * classifier.predict(feature[index])
        # final strong classifier: G(x) = sign(sum_m alpha_m * G_m(x))
        if result > 0:
            return 1
        else:
            return -1
def predict(self, features):
results = []
for feature in features:
results.append(self._predict(feature))
return results
# Binarize one image: any nonzero pixel becomes 1
def binaryzation(image):
cv_img = []
for i in image:
if i > 0:
cv_img.append(1)
else:
cv_img.append(0)
return np.array(cv_img)
def binaryzation_features(train_set):
features = []
for img in train_set:
img = binaryzation(img)
features.append(img)
features = np.array(features)
features = features.reshape(-1, 784)
return features
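# A vectorized equivalent of the two functions above, assuming train_set is an
# integer ndarray of shape (n, 784); the explicit loops are kept to mirror the
# step-by-step logic:
def binaryzation_features_vectorized(train_set):
    return (train_set > 0).astype(int)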
if __name__ == '__main__':
logger = logging.getLogger()
logger.setLevel(logging.DEBUG)
print('Start reading data:')
time1 = time.time()
raw_data = pd.read_csv('data/train_binary.csv', header=0)
data = raw_data.values
imgs = data[:, 1:]
labels = data[:, 0]
    # binarize the images first
features = binaryzation_features(imgs)
    # split into 1/2 training set, 1/2 test set
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size=0.5, random_state=0)
print(train_features.shape)
time2 = time.time()
print('read data cost %f seconds' % (time2 - time1))
print('Start training:')
    # Map the labels to 1 and -1 by applying the lambda to train_labels.
    # In Python 3, map returns an iterator, hence the outer list().
train_labels = list(map(lambda x: 2 * x - 1, train_labels))
ada = AdaBoost()
ada.train(train_features, train_labels)
time3 = time.time()
print('training cost %f seconds' % (time3 - time2))
print('Start predicting:')
test_predict = ada.predict(test_features)
time4 = time.time()
print('predicting cost %f seconds' % (time4 - time3))
    # the test labels must also be mapped to 1 and -1
test_labels = list(map(lambda x: 2 * x - 1, test_labels))
accuracy = sum([test_labels[i] == test_predict[i] for i in range(len(test_labels))]) / len(test_labels)
print("The accuracy is %f!" % accuracy)
'''
output:
Start reading data:
(21000, 784)
read data cost 17.040476 seconds
Start training:
DEBUG:root:iteration 0
DEBUG:root:iteration 1
...(training takes a long time; the final accuracy reaches 98% or higher)
'''
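As a sanity check, the same experiment can be reproduced with scikit-learn's built-in AdaBoost over depth-1 decision trees. A rough sketch, reusing the arrays prepared above (accuracy and runtime will differ, since sklearn's stumps pick real-valued splits):

from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=10)
clf.fit(train_features, train_labels)
print('sklearn accuracy: %f' % clf.score(test_features, test_labels))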