Train the algorithms
GaussianNB
# --- Gaussian Naive Bayes, evaluated with 10-fold cross-validation ---
# NOTE(review): the original also loaded the iris dataset here, but it was
# never used; the synthetic dataset below is what the model trains on.
dataset = datasets.make_classification(
    n_samples=1000,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
)

# 10-fold cross-validation over the 1000 samples.
kf = cross_validation.KFold(1000, n_folds=10, shuffle=True)

# Collect one score per fold. (The original overwrote acc/f1/auc on every
# iteration, so only the last fold's numbers survived the loop; appending
# also matches the SVC and random-forest sections below.)
accuracy = []
f1 = []
auc_roc = []
for train_index, test_index in kf:
    # dataset is a (X, y) tuple from make_classification.
    X_train, X_test = dataset[0][train_index], dataset[0][test_index]
    y_train, y_test = dataset[1][train_index], dataset[1][test_index]

    # Fit the Gaussian Naive Bayes classifier on this fold's training split.
    clf = GaussianNB()
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)

    # Per-fold evaluation metrics on the held-out split.
    accuracy.append(metrics.accuracy_score(y_test, pred))
    f1.append(metrics.f1_score(y_test, pred))
    auc_roc.append(metrics.roc_auc_score(y_test, pred))
结果如下:
SVC (possible C values [1e-02, 1e-01, 1e00, 1e01, 1e02], RBF kernel)
def rbf_svm(X_train, y_train, X_test, C):
    """Fit an RBF-kernel SVC with penalty C and predict labels for X_test.

    Class weights are 'balanced', i.e. inversely proportional to the class
    frequencies observed in y_train.
    """
    model = SVC(C=C, kernel='rbf', class_weight='balanced')
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return predictions
# --- RBF SVC: nested cross-validation to select the penalty parameter C ---
# Number of samples.
n_sam = 1000

# Synthetic binary-classification dataset. (The unused iris load from the
# original has been dropped.)
dataset = datasets.make_classification(
    n_samples=n_sam,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
)

# Outer 10-fold cross-validation for performance estimation.
kf = cross_validation.KFold(n_sam, n_folds=10, shuffle=True)

accuracy = []
f1 = []
auc_roc = []
# Candidate penalty values; loop-invariant, so defined once outside the
# outer CV loop.
Cvalues = [1e-2, 1e-1, 1e0, 1e1, 1e2]
for train_index, test_index in kf:
    X_train, X_test = dataset[0][train_index], dataset[0][test_index]
    y_train, y_test = dataset[1][train_index], dataset[1][test_index]

    nn = len(X_train)
    innerscore = []
    # Inner 5-fold cross-validation: score every candidate C using only
    # the outer-fold training data, so model selection never sees the
    # outer test split.
    for C in Cvalues:
        ikf = cross_validation.KFold(nn, n_folds=5, shuffle=True,
                                     random_state=5678)
        innerf1 = []
        for t_index, v_index in ikf:
            X_t, X_v = X_train[t_index], X_train[v_index]
            y_t, y_v = y_train[t_index], y_train[v_index]
            ipred = rbf_svm(X_t, y_t, X_v, C)
            # Keep the validation F1 score for this inner fold.
            innerf1.append(metrics.f1_score(y_v, ipred))
        # Mean inner-fold F1 for this C (np.mean replaces the manual
        # sum/len division).
        innerscore.append(np.mean(innerf1))

    # Pick the C with the best mean inner F1 score. (The dead
    # `bestC = None` pre-assignment from the original is removed.)
    bestC = Cvalues[np.argmax(innerscore)]

    # Refit on the full outer training split and score the test split.
    pred = rbf_svm(X_train, y_train, X_test, bestC)
    accuracy.append(metrics.accuracy_score(y_test, pred))
    f1.append(metrics.f1_score(y_test, pred))
    auc_roc.append(metrics.roc_auc_score(y_test, pred))
结果如下:
RandomForestClassifier (possible n estimators values [10, 100, 1000])
# Build a classifier parameterized by the number of trees.
def rbf_ranf(X_train, y_train, X_test, nest):
    """Fit a random forest with `nest` trees and predict labels for X_test.

    NOTE(review): the `rbf_` prefix is misleading — there is no RBF kernel
    here — but the name is kept because other code in this file calls it.
    """
    forest = RandomForestClassifier(n_estimators=nest)
    forest.fit(X_train, y_train)
    predictions = forest.predict(X_test)
    return predictions
# --- Random forest: nested cross-validation to select n_estimators ---
# Number of samples.
n_sam = 1000

# Synthetic binary-classification dataset. (The unused iris load from the
# original has been dropped.)
dataset = datasets.make_classification(
    n_samples=n_sam,
    n_features=10,
    n_informative=2,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
)

# Outer 10-fold cross-validation for performance estimation.
kf = cross_validation.KFold(n_sam, n_folds=10, shuffle=True)

accuracy = []
f1 = []
auc_roc = []
# Candidate tree counts; loop-invariant, so defined once outside the
# outer CV loop.
N_est = [10, 100, 1000]
for train_index, test_index in kf:
    X_train, X_test = dataset[0][train_index], dataset[0][test_index]
    y_train, y_test = dataset[1][train_index], dataset[1][test_index]

    nn = len(X_train)
    innerscore = []
    # Inner 5-fold cross-validation: score every candidate n_estimators
    # using only the outer-fold training data, so model selection never
    # sees the outer test split.
    for nest in N_est:
        ikf = cross_validation.KFold(nn, n_folds=5, shuffle=True,
                                     random_state=5678)
        innerf1 = []
        for t_index, v_index in ikf:
            X_t, X_v = X_train[t_index], X_train[v_index]
            y_t, y_v = y_train[t_index], y_train[v_index]
            ipred = rbf_ranf(X_t, y_t, X_v, nest)
            # Keep the validation F1 score for this inner fold.
            innerf1.append(metrics.f1_score(y_v, ipred))
        # Mean inner-fold F1 for this tree count (np.mean replaces the
        # manual sum/len division).
        innerscore.append(np.mean(innerf1))

    # Pick the n_estimators with the best mean inner F1 score. (The dead
    # `bestN = None` pre-assignment from the original is removed.)
    bestN = N_est[np.argmax(innerscore)]

    # Refit on the full outer training split and score the test split.
    pred = rbf_ranf(X_train, y_train, X_test, bestN)
    accuracy.append(metrics.accuracy_score(y_test, pred))
    f1.append(metrics.f1_score(y_test, pred))
    auc_roc.append(metrics.roc_auc_score(y_test, pred))
结果如下:
结论:从结果来看,随机森林方法的效果最好。