实验目的
掌握大规模数据集渐进融合学习框架的使用方法
实验内容
将大规模数据集 Magic Dataset 随机划分成训练集 $\hat D$ 和验证集 $D$ 两部分,之后确定训练集 $\hat D$ 的随机样本划分 $\hat D_1,\hat D_2,\ldots,\hat D_k$,其中 $\hat D=\bigcup_{i=1}^{k}\hat D_i$,且对于任意的 $i,j\in\{1,2,\ldots,k\},\,i\neq j$,有 $\hat D_i\cap\hat D_j=\varnothing$。基于每一个随机样本划分数据块训练一个分类器,得到一个分类器集合
$f_1,f_2,\ldots,f_k$,之后依次验证
$f_1,\; f_1+f_2,\;\ldots,\; f_1+f_2+\cdots+f_k$ 在验证集 $D$ 上的预测精度,并绘制相应的学习曲线。
我们要求使用两种不同类型的分类器(例如,贝叶斯、神经网络、决策树等),即绘制两个学习曲线图。观察学习曲线的收敛情况,有能力的同学可以与非随机样本划分数据块的渐进融合进行比较,得出相应的实验结论。
数据划分
我们首先导入Magic数据集,并划分为train和test(0.7、0.3)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.model_selection import learning_curve, validation_curve, cross_val_score, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore')
import random
import seaborn as sns
sns.set_theme()
# Load the Magic dataset; each row is one sample, the last column is the label.
data = np.loadtxt("../data/Magic Dataset.txt")
X, y = data[:, :-1], data[:, -1]
print(data.shape)
print(data)
# Hold out 30% of the samples as the test set (unseeded random split).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print('train shape', X_train.shape)
print('test shape', X_test.shape)
接着,我们进行RSP数据块的划分
RSP的划分方式为:首先将数据集按顺序等分为HDFS数据块,再按对应位置抽取数据组合为RSP数据块
def data2HDFS_RSP(data, K=65, M=20):
    """Partition *data* into K sequential HDFS blocks, then regroup into M RSP blocks.

    The rows of *data* are split, in order, into K equally sized (HDFS-style)
    blocks. Each block is shuffled and sliced into M equal parts; RSP block m
    is the concatenation of part m from every HDFS block, so each RSP block is
    a random sample drawn across the whole dataset.

    Parameters
    ----------
    data : ndarray of shape (n, d)
        Row-oriented dataset; n must be divisible by K, and n // K by M.
    K : int
        Number of HDFS blocks.
    M : int
        Number of RSP blocks.

    Returns
    -------
    (HDFS, RSP) : tuple of ndarrays
        HDFS has shape (K, n // K, d); RSP has shape (M, n // M, d).

    Raises
    ------
    ValueError
        If the rows cannot be divided evenly. (The original version caught
        every exception with a bare ``except:``, issued a warning, and then
        crashed with UnboundLocalError on the return statement.)
    """
    n = data.shape[0]
    if K <= 0 or M <= 0 or n % K != 0 or (n // K) % M != 0:
        raise ValueError(
            "wrong dimension of K and M: need len(data) divisible by K "
            "and len(data) // K divisible by M"
        )
    # Sequential split into K equal HDFS blocks (np.array copies the views,
    # so the caller's data is never mutated).
    HDFS = np.array(np.split(data, K))
    # Shuffle rows inside each HDFS block so its M slices are random samples.
    for block in HDFS:
        np.random.shuffle(block)
    # Slice every HDFS block into M parts; RSP block m stacks part m of each.
    parts_per_block = [np.split(block, M) for block in HDFS]
    RSP = np.array([
        np.concatenate([parts[m] for parts in parts_per_block])
        for m in range(M)
    ])
    return (HDFS, RSP)
# Assemble the training matrix (features + label column); keep 13000 rows so
# it splits evenly into 65 HDFS blocks of 200 rows (and 20 RSP blocks of 650).
data_train = np.hstack([X_train, y_train[:, np.newaxis]])[:13000, :]
HDFS, RSP = data2HDFS_RSP(data_train, K=65, M=20)
n_hdfs, hdfs_rows, hdfs_dim = HDFS.shape
print('HDFS: [块数: {0} 块内元素个数: {1} 数据块维度: {2}]'.format(
    n_hdfs, hdfs_rows, hdfs_dim))
n_rsp, rsp_rows, rsp_dim = RSP.shape
print('RSP: [块数: {0} 块内元素个数: {1} 数据块维度: {2}]'.format(
    n_rsp, rsp_rows, rsp_dim))
# Pair each block's feature matrix with its label vector.
HDFS_list = [(block[:, :-1], block[:, -1]) for block in HDFS]
RSP_list = [(block[:, :-1], block[:, -1]) for block in RSP]
HDFS: [块数: 65 块内元素个数: 200 数据块维度: 11]
RSP: [块数: 20 块内元素个数: 650 数据块维度: 11]
划分得到HDFS的数量为65,每块HDFS内部含有200个数据点;得到RSP的数量为20,每块RSP内部含有650个数据点。HDFS和RSP的数据点都含有10个特征
我们观察RSP数据块的特征分布情况,检验各个RSP数据块的分布是否一致
# Compare per-feature distributions of 5 randomly chosen RSP blocks; if RSP
# sampling works, the curves of different blocks should nearly coincide.
plt.style.use('seaborn')
fig, axes = plt.subplots(ncols=2, nrows=5, figsize=[18, 18])
# Bug fix: sample RSP *block* indices. The original sampled from
# range(X.shape[1]) (the 10 feature indices) instead of the 20 RSP blocks.
RSP_sample = random.sample(range(RSP.shape[0]), 5)
for i, ax_ in enumerate(axes.flat[:10]):
    ax_.set_title('feature ' + str(i + 1))
    # Overlay density histogram + KDE of feature i for each sampled block.
    # histplot replaces distplot, which is deprecated in seaborn >= 0.11.
    for j in RSP_sample:
        sns.histplot(RSP[j, :, i], kde=True, stat='density', ax=ax_, bins=15)
由图1我们可以观察到,不同RSP数据块之间的分布较为相似,这说明各个RSP之间的差异很小,可以代表总体的分布
模型训练
这一步我们进行模型的训练,我总共挑选了KNN、SVM、Decision Tree、RandomForest、GaussianNB这五种模型,这五种模型可以很好的概括总的机器学习模型
在模型训练中,一种很好的观察模型训练情况的方法就是绘制模型的学习曲线,所谓学习曲线,就是当训练样本逐渐增加的时候,模型在训练集和测试集的accuracy变化曲线
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from copy import deepcopy
# def plot_learningCurve(pipes, X, y):
# #case1:学习曲线
# #构建学习曲线评估器,train_sizes:控制用于生成学习曲线的样本的绝对或相对数量
# train_sizes,train_scores, test_scores = learning_curve(estimator=pipes,X=X,y=y,train_sizes=np.linspace(0.1,1.0,10),cv=10,n_jobs=1)
# # cv = KFold(n_splits = 10)
# # test_scores = cross_val_score(pipes, X_test, y_test, cv = cv)
# # print(train_scores)
# # print('train',train_scores.shape)
# # print('test',test_scores.shape)
# #统计结果
# train_mean= np.mean(train_scores,axis=1)
# train_std = np.std(train_scores,axis=1)
# test_mean =np.mean(test_scores,axis=1)
# test_std=np.std(test_scores,axis=1)
# #绘制效果
# plt.plot(train_sizes,train_mean,color='blue',marker='o',markersize=5,label='training accuracy')
# plt.fill_between(train_sizes,train_mean+train_std,train_mean-train_std,alpha=0.15,color='blue')
# plt.plot(train_sizes,test_mean,color='green',linestyle='--',marker='s',markersize=5,label='test accuracy')
# plt.fill_between(train_sizes,test_mean+test_std,test_mean-test_std,alpha=0.15,color='green')
# plt.grid()
# plt.xlabel('Number of training samples')
# plt.ylabel('Accuracy')
# plt.legend(loc='lower right')
# plt.title(pipes['clf'].__class__.__name__)
# plt.show()
def plot_learningCurve_(pipes, X_train, y_train, X_test, y_test, train_sizes = 10):
    """Plot a learning curve for the pipeline *pipes*.

    The pipeline is fitted on growing prefixes (5% .. 100%, in `train_sizes`
    steps) of the training data. Training accuracy is the mean of 10-fold
    cross-validation on the current subset (shaded band = +/- one std);
    test accuracy is the score on the fixed held-out (X_test, y_test).

    Note: *pipes* is refit in place, ending up fitted on the full subset.
    """
    train_score_list = []
    test_score_list = []
    nums_list = []
    for rate in np.linspace(0.05, 1, train_sizes):
        nums = int(X_train.shape[0] * rate)
        nums_list.append(nums)
        # First `nums` training samples and their labels. (The labels were
        # previously mis-named `test_data`; slicing with [:nums] also accepts
        # 1-D label arrays, where the original [:nums, :] required 2-D.)
        X_sub = X_train[:nums]
        y_sub = y_train[:nums]
        pipes.fit(X_sub, y_sub)
        # Training accuracy estimated by 10-fold cross-validation.
        train_score_list.append(cross_val_score(pipes, X_sub, y_sub, cv=10))
        # Test accuracy on the fixed held-out set.
        test_score_list.append(pipes.score(X_test, y_test))
    train_scores = np.array(train_score_list)
    test_scores = np.array(test_score_list)
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    # Blue: CV training accuracy with +/- 1 std band. Green: test accuracy.
    plt.figure(figsize=[12, 8])
    plt.plot(nums_list, train_mean, color='blue', marker='o', markersize=5, label='training accuracy')
    plt.fill_between(nums_list, train_mean + train_std, train_mean - train_std, alpha=0.15, color='blue')
    plt.plot(nums_list, test_scores, color='green', linestyle='--', marker='s', markersize=5, label='test accuracy')
    plt.grid(True)
    plt.xlabel('Number of training samples')
    plt.ylabel('Accuracy')
    plt.legend(loc='lower right')
    plt.title(pipes['clf'].__class__.__name__)
    plt.show()
# Candidate model families for the learning-curve study.
classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    GaussianNB(),
]
# Wrap each classifier with feature standardization in a Pipeline;
# the step names 'scl'/'clf' are referenced by plot_learningCurve_.
pipeline_list = [
    Pipeline([('scl',StandardScaler()), ('clf',clf)])
    for clf in classifiers
]
# Train on the first RSP block only; labels reshaped to a column vector
# because plot_learningCurve_ slices them with [:nums, :].
X_train_ = RSP_list[0][0]
y_train_ = RSP_list[0][1][:,np.newaxis]
# One learning-curve figure per model family.
for pipes in pipeline_list:
    plot_learningCurve_(pipes, X_train_, y_train_, X_test, y_test, train_sizes=20)
通过上述的学习曲线,我们可以观察到蓝色的折线(模型的训练曲线)的波动较大,但总体的趋势是递增的;绿色的折线(模型在测试集上的准确率)是逐步提升的,这说明我们模型的表现率在随着训练样本的增加而不断增加
渐进融合
下面我们进行模型的渐进融合,我采取的方式是
- 在20个RSP数据块上训练20个分类器
- 每个分类器在测试集上通过 predict_proba 输出各类别的预测概率值(p1,p2)
- 我们将每个分类器上的概率值取平均得到(p1’,p2’)
- 然后比较p1’和p2’来确定某个样本点所属的类别
步骤大致如上,我们一开始只用一个模型,然后逐渐增加模型的数量,直至使用完所有的模型
进行完以上流程之后,我们绘制模型的渐进融合曲线,观察accuracy趋势
RSP数据块下渐进融合
# (以下为渐进融合中所用到的函数)
def score_(y_pred, y_test):
    """Return the accuracy: the fraction of entries of *y_pred* equal to the
    corresponding entry of *y_test*.

    Accepts 1-D arrays or (n, 1) column vectors for either argument (both are
    flattened, matching the elementwise comparison of the original loop).
    Returns 0.0 for empty input instead of raising ZeroDivisionError.
    """
    y_pred = np.asarray(y_pred).ravel()
    if y_pred.shape[0] == 0:
        return 0.0
    # Compare only the first len(y_pred) labels, as the index loop did.
    y_true = np.asarray(y_test).ravel()[:y_pred.shape[0]]
    return float(np.count_nonzero(y_pred == y_true)) / y_pred.shape[0]
def score_Test_ensemble(model_list, X_test, y_test):
    """Accuracy of the progressively fused ensembles f1, f1+f2, ..., f1+...+fk.

    For each prefix of *model_list*, the per-class probabilities predicted on
    X_test are averaged across the prefix's classifiers and the class with the
    higher mean probability is chosen.

    Returns a list of len(model_list) accuracies. The original loop used
    range(1, len(model_list)), so the full k-model ensemble was never scored;
    the range now includes it.
    """
    y_true = np.asarray(y_test).ravel()
    score_list = []
    for i in range(1, len(model_list) + 1):  # include the full k-model ensemble
        # Average the class-probability outputs of the first i classifiers.
        proba = np.mean([clf.predict_proba(X_test) for clf in model_list[:i]], axis=0)
        # Labels are hard-coded as 1/2 (Magic dataset); ties resolve to class 1.
        y_pred = np.where(proba[:, 0] >= proba[:, 1], 1, 2)
        score_list.append(float(np.mean(y_pred == y_true)))
    return score_list
def plot_ensemble_curve(model_score, name_model, stride=1):
    """Overlay the progressive-fusion accuracy curve of each model family.

    model_score : list of per-family accuracy lists (one value per ensemble size)
    name_model  : matching family names, used as legend labels
    stride      : spacing between x-axis ticks
    """
    plt.figure(figsize=[12, 8])
    for score_list, name in zip(model_score, name_model):
        xs = range(len(score_list))
        plt.plot(xs, score_list, 'o--', label=name)
        _ = plt.xticks(range(0, len(score_list), stride))
    plt.legend()
    plt.xlabel('estimators nums')
    plt.ylabel('accuracy')
def plot_curve(score_list, name, stride=1):
    """Plot a single family's progressive-fusion accuracy against ensemble size."""
    plt.figure(figsize=[12, 8])
    xs = list(range(len(score_list)))
    plt.plot(xs, score_list, 'o--', label=name)
    _ = plt.xticks(range(0, len(score_list), stride))
    plt.title(name)
    plt.xlabel('estimators nums')
    plt.ylabel('accuracy')
# Train one StandardScaler+classifier pipeline per RSP block for each of the
# four model families (the RandomForest family is kept but left unused, as in
# the original experiment).
decisionTree_list = []
naiveBayes_list = []
KNN_list = []
SVC_list = []
RandomForest_list = []
_families_rsp = [
    (decisionTree_list, lambda: DecisionTreeClassifier(max_depth=5)),
    (naiveBayes_list, GaussianNB),
    (KNN_list, lambda: KNeighborsClassifier(3)),
    (SVC_list, lambda: SVC(kernel="rbf", C=1, probability=True)),
]
for X_blk, y_blk in RSP_list:
    for bucket, make_clf in _families_rsp:
        bucket.append(
            Pipeline([('scl', StandardScaler()), ('clf', make_clf())]).fit(X_blk, y_blk)
        )
# Score the progressive fusion f1, f1+f2, ... for every family.
model_score = []
name_model = []
for model_list in (decisionTree_list, naiveBayes_list, KNN_list, SVC_list):
    # Class name of the wrapped classifier, e.g. "DecisionTreeClassifier".
    name = str(model_list[0]['clf']).split('(')[0]
    print(name)
    name_model.append(name)
    model_score.append(score_Test_ensemble(model_list, X_test, y_test))
plot_ensemble_curve(model_score, name_model)
for idx in range(len(model_score)):
    plot_curve(model_score[idx], name_model[idx])
通过以上的accuracy趋势图,我们能够发现两个现象
- 各个模型在测试集上的accuracy随着融合的模型数量的增加而增加
- 决策树、KNN、SVM模型的分类情况都不错,可是朴素贝叶斯模型的分类准确率较低,猜测是因为数据的特征之间并不完全独立,违反了朴素贝叶斯模型的特征条件独立假设
HDFS数据块下渐进融合
HDFS的渐进融合的方式与RSP完全相似,只不过模型训练的数据变为了HDFS数据块
# Progressive fusion on HDFS blocks: same procedure as for RSP, but the
# classifiers are trained on the sequential HDFS blocks instead.
decisionTree_list = []
naiveBayes_list = []
KNN_list = []
SVC_list = []
RandomForest_list = []
_families_hdfs = [
    (decisionTree_list, lambda: DecisionTreeClassifier(max_depth=5)),
    (naiveBayes_list, GaussianNB),
    (KNN_list, lambda: KNeighborsClassifier(3)),
    (SVC_list, lambda: SVC(kernel="rbf", C=1, probability=True)),
]
for X_blk, y_blk in HDFS_list:
    for bucket, make_clf in _families_hdfs:
        bucket.append(
            Pipeline([('scl', StandardScaler()), ('clf', make_clf())]).fit(X_blk, y_blk)
        )
# Score the progressive fusion f1, f1+f2, ... for every family.
model_score = []
name_model = []
for model_list in (decisionTree_list, naiveBayes_list, KNN_list, SVC_list):
    # Class name of the wrapped classifier, e.g. "DecisionTreeClassifier".
    name = str(model_list[0]['clf']).split('(')[0]
    print(name)
    name_model.append(name)
    model_score.append(score_Test_ensemble(model_list, X_test, y_test))
plot_ensemble_curve(model_score, name_model, stride=2)
for idx in range(len(model_score)):
    plot_curve(model_score[idx], name_model[idx], 2)
通过上述渐进融合趋势图,我们可以发现
- 不同模型的渐进融合趋势都随着融合模型的个数增加而增加,最后趋于收敛
- 朴素贝叶斯模型的渐进融合趋势图在融合的模型个数较少时先是随着融合模型的个数增加而波动着减少,在融合的模型个数超过14个时,渐进融合趋势随着融合模型的个数增加而增加,这里猜测是因为HDFS数据块的分布并不服从数据集总体的分布,所以在模型的渐进式融合过程中,会产生较为剧烈的波动
实验结论
- 在RSP数据块上进行渐进融合时,各个模型在测试集上的accuracy随着融合的模型数量的增加而增加,决策树、KNN、SVM模型的分类情况都不错,可是朴素贝叶斯模型的分类准确率较低,猜测是因为数据的特征之间并不完全独立,违反了朴素贝叶斯模型的特征条件独立假设
- 在HDFS数据块上进行渐进融合时,朴素贝叶斯模型的渐进融合趋势图在融合的模型个数较少时,先是随着融合模型的个数增加而波动着减少;而在融合的模型个数超过14个时,渐进融合趋势随着融合模型的个数增加而增加。这里猜测是因为HDFS数据块的分布并不服从数据集总体的分布,所以在模型的渐进式融合过程中,会产生较为剧烈的波动