代码实现:AdaBoost
from numpy import *
import numpy as np
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import time
from numpy import *
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
def loadDataSet(fileName):
    """Load a whitespace-delimited dataset from *fileName*.

    Each non-blank line holds three feature values followed by an integer
    class label, e.g. ``1.0 2.0 3.0 1``.

    Returns:
        (dataMat, labelMat): a list of ``[f0, f1, f2]`` float rows and the
        parallel list of int labels.
    """
    dataMat = []
    labelMat = []
    with open(fileName) as fr:
        for line in fr:
            # split() (no argument) tolerates tabs and runs of spaces,
            # where split(' ') would yield empty tokens and crash float().
            parts = line.split()
            if not parts:  # skip blank lines
                continue
            values = [float(v) for v in parts]
            dataMat.append(values[:3])
            labelMat.append(int(values[3]))
    return dataMat, labelMat
# Load the training set: three float features + integer label per row.
train_data, train_label = loadDataSet('img/txt/left.txt')
train_data = np.array(train_data, dtype='float32')
# Load the test set from a separate file.
test_data, test_label = loadDataSet('img/txt/right.txt')
test_data = np.array(test_data, dtype='float32')
# n_estimators: number of boosting rounds (default 50). Too large risks
#   overfitting; too small risks underfitting.
# algorithm: 'SAMME' weights weak learners by their misclassification rate;
#   'SAMME.R' (the default) uses predicted class probabilities instead and
#   typically converges faster.
clf = AdaBoostClassifier(n_estimators=100, algorithm='SAMME.R', learning_rate=0.1)
clf.fit(train_data, train_label)
# Predict the whole test set in one vectorized call instead of calling
# predict() once per sample; the mean of the original per-sample accuracies
# equals this overall accuracy. accuracy_score expects (y_true, y_pred).
acc = accuracy_score(test_label, clf.predict(test_data))
print(acc)
代码实现:随机森林
from numpy import *
import numpy as np
import cv2
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import cross_val_score
def loadDataSet(fileName):
    """Load a whitespace-delimited dataset from *fileName*.

    Each non-blank line holds three feature values followed by an integer
    class label, e.g. ``1.0 2.0 3.0 1``.

    Returns:
        (dataMat, labelMat): a list of ``[f0, f1, f2]`` float rows and the
        parallel list of int labels.
    """
    dataMat = []
    labelMat = []
    with open(fileName) as fr:
        for line in fr:
            # split() (no argument) tolerates tabs and runs of spaces,
            # where split(' ') would yield empty tokens and crash float().
            parts = line.split()
            if not parts:  # skip blank lines
                continue
            values = [float(v) for v in parts]
            dataMat.append(values[:3])
            labelMat.append(int(values[3]))
    return dataMat, labelMat
# Load the training set: three float features + integer label per row.
train_data, train_label = loadDataSet('img/txt/left.txt')
train_data = np.array(train_data, dtype='float32')
# Load the test set from a separate file.
test_data, test_label = loadDataSet('img/txt/right.txt')
test_data = np.array(test_data, dtype='float32')
# NOTE(review): only the *test* file is split into train/test here;
# train_data/train_label are loaded but never used — confirm intentional.
# random_state makes the split (and the printed scores) reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    test_data, test_label, test_size=0.3, random_state=0)

# Baseline: a single decision tree on the same split.
clf = DecisionTreeClassifier(random_state=0)
clf = clf.fit(Xtrain, Ytrain)
score_c = clf.score(Xtest, Ytest)
print("Single Tree:{}".format(score_c))

# Random forest.
# oob_score=True: estimate generalization error from out-of-bag samples.
# criterion: 'gini' or 'entropy' (information gain), as for a single tree.
# max_features='sqrt' replaces the deprecated 'auto' (removed in
# scikit-learn 1.3); for classifiers the two settings were equivalent.
rfc = RandomForestClassifier(criterion='gini', oob_score=True, n_jobs=1,
                             random_state=1, max_features="sqrt")
rfc = rfc.fit(Xtrain, Ytrain)
score_r = rfc.score(Xtest, Ytest)
print("Random Forest:{}".format(score_r))
results = rfc.predict(Xtest)
# predict_proba: per-class probabilities; each row sums to 1.
results_proba = rfc.predict_proba(Xtest)
print(results)
# print(results_proba)