# 使用了机器学习的主要分类算法,包括xgboost,lightgbm,catboost等等。同时实现了stacking方法。
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd
from functools import reduce
from sklearn import tree, svm, naive_bayes,neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
# Load the digits dataset, standardise the features, then split it into
# training and test partitions.
data = load_digits()
data_D = preprocessing.StandardScaler().fit_transform(data.data)
data_L = data.target
# test_size=0.7: only 30% of the samples are kept for training.
data_train, data_test, label_train, label_test = train_test_split(
    data_D,
    data_L,
    test_size=0.7,
    random_state=1,
)
print(data_train.shape)
print(data_test.shape)

# First-level (base) models for stacking, keyed by display name.
# Insertion order matters: it fixes the column order of the stacked features.
# Candidates that are currently switched off:
#   'naive_mul': naive_bayes.MultinomialNB()
#   'random_forest': RandomForestClassifier()
#   'gradient_boost': GradientBoostingClassifier()
#   'catboost': CatBoostClassifier(loss_function='MultiClass', custom_loss=['F1'])
clfs = {
    'svm': svm.SVC(kernel='rbf', probability=True),
    'decision_tree': tree.DecisionTreeClassifier(),
    'naive_gaussian': naive_bayes.GaussianNB(),
    'K_neighbor': neighbors.KNeighborsClassifier(),
    'bagging_knn': BaggingClassifier(neighbors.KNeighborsClassifier()),
    'bagging_tree': BaggingClassifier(tree.DecisionTreeClassifier()),
    'logistic_reg': LogisticRegression(random_state=1),
    'adaboost': AdaBoostClassifier(),
    'xgboost': XGBClassifier(objective='multi:softprob'),
    'RandomForest': RandomForestClassifier(),
    'lightgbm': LGBMClassifier(
        boosting_type='gbdt',
        objective='multiclass',
        metric='multi_error',
    ),
}
def get_oof(clf, n_folds, X_train, y_train, X_test, model_name):
    """Compute out-of-fold class probabilities for one stacking base model.

    The training data is split into ``n_folds`` stratified folds. For each
    fold, ``clf`` is re-fitted in place on the remaining folds; its
    ``predict_proba`` output fills the held-out rows of the training matrix
    and is accumulated for the test set.

    Args:
        clf: Classifier exposing ``fit`` and ``predict_proba``.
        n_folds: Number of stratified folds.
        X_train: Training features; must expose ``.shape`` and accept
            integer index arrays.
        y_train: Training labels; must accept integer index arrays.
        X_test: Test features; must expose ``.shape``.
        model_name: Model identifier. ``'catboost'`` and ``'xgboost'`` are
            fitted with ``verbose=False``.

    Returns:
        A tuple ``(oof_train, oof_test)``. ``oof_train`` has shape
        ``(n_train, n_classes)`` and holds each row's probabilities from the
        model that did not see that row. ``oof_test`` has shape
        ``(n_test, n_classes)`` and is the mean of the per-fold test
        probabilities.
    """
    n_train = X_train.shape[0]
    n_test = X_test.shape[0]
    n_classes = len(np.unique(y_train))

    # No random_state here: without shuffle=True it never influenced the
    # split, and recent scikit-learn releases raise ValueError when it is
    # combined with shuffle=False.
    kf = StratifiedKFold(n_splits=n_folds)

    oof_train = np.zeros((n_train, n_classes))
    oof_test = np.zeros((n_test, n_classes))

    # Only these estimators get fit(verbose=False). LightGBM is left out
    # because its scikit-learn wrapper no longer accepts that keyword, and
    # with no eval_set there is no per-iteration output to silence anyway.
    quiet_fit_models = ('catboost', 'xgboost')

    for train_index, val_index in kf.split(X_train, y_train):
        fold_X = X_train[train_index]
        fold_y = y_train[train_index]
        fold_X_val = X_train[val_index]

        if model_name in quiet_fit_models:
            clf.fit(fold_X, fold_y, verbose=False)
        else:
            clf.fit(fold_X, fold_y)

        oof_train[val_index] = clf.predict_proba(fold_X_val)
        oof_test += clf.predict_proba(X_test)

    # Average the test-set probabilities over all folds.
    oof_test = oof_test / float(n_folds)
    return oof_train, oof_test
# 单纯使用一个分类器的时候
# clf_second = RandomForestClassifier()
# clf_second = LGBMClassifier()
# clf_second.fit(data_train, label_train)
# pred = clf_second.predict(data_test)
# accuracy = metrics.accuracy_score(label_test, pred)*100
# print(accuracy)
# 91.0969793323
# Stacking, level one: the out-of-fold class probabilities of every base
# model become the feature matrix for the second-level model.
newfeature_list = []
newtestdata_list = []
for i, (model_name, model) in enumerate(clfs.items()):
    print(i+1, ":"+model_name+" start training...")
    oof_train_, oof_test_ = get_oof(
        clf=model,
        n_folds=5,
        X_train=data_train,
        y_train=label_train,
        X_test=data_test,
        model_name=model_name,
    )
    newfeature_list.append(oof_train_)
    newtestdata_list.append(oof_test_)

print('Feature combination...')
print(np.shape(newfeature_list))  # (11, 539, 10)
# Join the per-model probability blocks column-wise in a single call rather
# than re-copying the growing matrix once per model.
newfeature = np.concatenate(newfeature_list, axis=1)
print(np.shape(newfeature))  # (539, 110)
newtestdata = np.concatenate(newtestdata_list, axis=1)
# 第二级,使用上一级输出的当做训练集
# print('Hyperparameter optimization...')
# def acc_model(params,X_train,Y_train):
# clf = RandomForestClassifier(**params)
# return cross_val_score(clf, X_train, Y_train).mean()
#
# param_space = {
# # 'max_depth': hp.choice('max_depth', range(1,20)),
# 'max_depth': hp.choice('max_depth', range(15, 20)),
# # 'max_features': hp.choice('max_features', range(10,150)),
# 'max_features': hp.choice('max_features', range(10,15)),
# # 'n_estimators': hp.choice('n_estimators', range(100,500)),
# 'n_estimators': hp.choice('n_estimators', range(10, 20)),
# 'criterion': hp.choice('criterion', ["gini", "entropy"])}
#
# best = 0
# def f(params):
# global best,param_best
# acc = acc_model(params,newfeature,label_train)
# if acc > best:
# best = acc
# param_best=params
# # print('\nnew best:', best, params)
# return {'loss': -acc, 'status': STATUS_OK}
#
# fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=Trials())
# print('best:')
# print(param_best)
# clf_second = RandomForestClassifier(**param_best)
# clf_second.fit(newfeature, label_train)
# oof = clf_second1.predict(newfeature)
# pred = clf_second.predict(newtestdata)
# Stacking, level two: a CatBoost classifier is trained on the stacked
# first-level probabilities.
# Alternative second-level model that was tried:
#   LGBMClassifier(boosting_type='gbdt', objective='multiclass', metric='multi_error')
clf = {'catboost': CatBoostClassifier(loss_function='MultiClass', custom_loss=['F1'])}
oof, pred = get_oof(
    clf=clf['catboost'],
    n_folds=5,
    X_train=newfeature,
    y_train=label_train,
    X_test=newtestdata,
    model_name='catboost',
)
# pred holds fold-averaged class probabilities; pick the most probable column.
# NOTE(review): the column index is used directly as the label, which assumes
# the labels are 0..n_classes-1 — confirm if the dataset changes.
pred = np.argmax(pred, axis=1)
# scikit-learn metrics expect (y_true, y_pred) in that order.
print('f1', metrics.f1_score(label_test, pred, average='macro'))
# Recorded result: f1 0.9634845862097758
# Accuracy can be reported instead with:
#   metrics.accuracy_score(label_test, pred) * 100
# Recorded accuracies:
#   96.4228934817
#   96.66136724960255
#   96.89984101748807