机器学习分类算法汇总

本文汇总了机器学习中常用的分类算法,包括 XGBoost、LightGBM、CatBoost 等,并在此基础上实现了 stacking(堆叠集成)方法。

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_digits
import numpy as np
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
import pandas as pd
from functools import reduce
from sklearn import tree, svm, naive_bayes,neighbors
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

# Load the scikit-learn digits dataset, standardise the features, and split
# the samples into a training set and a test set.
data = load_digits()
data_L = data.target
# NOTE(review): the scaler is fitted on every sample before the split, so the
# test rows contribute to the scaling statistics -- confirm this is intended.
scaler = preprocessing.StandardScaler()
data_D = scaler.fit_transform(data.data)
# test_size=0.7 holds out 70% of the samples; only 30% are used for training.
data_train, data_test, label_train, label_test = train_test_split(
    data_D,
    data_L,
    test_size=0.7,
    random_state=1,
)

print(data_train.shape)
print(data_test.shape)

# First-level (base) classifiers for stacking, keyed by display name.
# Insertion order matters: the stacking loop further down iterates this dict
# and concatenates each model's probability columns in that order.
# The keys 'xgboost' and 'lightgbm' are matched by name inside get_oof.
# Entries that are commented out are disabled models.
clfs = {
    # probability=True because get_oof calls predict_proba on every model.
    'svm': svm.SVC(kernel='rbf', probability=True),
    'decision_tree': tree.DecisionTreeClassifier(),
    'naive_gaussian': naive_bayes.GaussianNB(),
    # 'naive_mul': naive_bayes.MultinomialNB(),
    'K_neighbor': neighbors.KNeighborsClassifier(),
    'bagging_knn': BaggingClassifier(neighbors.KNeighborsClassifier()),
    'bagging_tree': BaggingClassifier(tree.DecisionTreeClassifier()),
    # 'random_forest': RandomForestClassifier(),
    'logistic_reg': LogisticRegression(random_state=1),
    'adaboost': AdaBoostClassifier(),
    # 'gradient_boost': GradientBoostingClassifier(),
    'xgboost': XGBClassifier(objective='multi:softprob'),
    'RandomForest': RandomForestClassifier(),
    'lightgbm': LGBMClassifier(
        boosting_type='gbdt',
        objective='multiclass',
        metric='multi_error',
    ),
    # 'catboost': CatBoostClassifier(loss_function='MultiClass', custom_loss=['F1'])
}


def get_oof(clf, n_folds, X_train, y_train, X_test, model_name):
    """Return out-of-fold and test-set class probabilities for one model.

    The training data is split into ``n_folds`` stratified folds. For each
    fold, ``clf`` is refitted on the other folds, its ``predict_proba`` output
    for the held-out fold is written into the out-of-fold matrix, and its
    ``predict_proba`` output for ``X_test`` is added to an accumulator that is
    divided by ``n_folds`` at the end.

    Args:
        clf: Estimator exposing ``fit`` and ``predict_proba``. It is fitted
            in place, so after the call it holds the last fold's fit.
        n_folds: Number of stratified folds.
        X_train: Training features. Indexed with the fold index arrays, so
            presumably a NumPy array (TODO confirm for other callers).
        y_train: Training labels, indexed the same way as ``X_train``.
        X_test: Test features passed to ``predict_proba`` on every fold.
        model_name: Name of the model; ``'catboost'``, ``'xgboost'`` and
            ``'lightgbm'`` request a quiet fit (``verbose=False``).

    Returns:
        Tuple ``(oof_train, oof_test)`` of float arrays with shapes
        ``(X_train.shape[0], n_classes)`` and ``(X_test.shape[0], n_classes)``,
        where ``n_classes`` is the number of distinct values in ``y_train``.
    """
    import inspect  # function-scope import keeps this block self-contained

    n_classes = len(np.unique(y_train))
    oof_train = np.zeros((X_train.shape[0], n_classes))
    oof_test = np.zeros((X_test.shape[0], n_classes))

    # No random_state: without shuffle=True the split is already
    # deterministic, older scikit-learn ignored the seed, and newer releases
    # raise ValueError when random_state is given while shuffle is False.
    kf = StratifiedKFold(n_splits=n_folds)

    # Ask the boosting libraries for a quiet fit, but only pass `verbose`
    # when this version's fit() still accepts it (newer LightGBM dropped the
    # argument and would raise TypeError).
    fit_kwargs = {}
    if model_name in ('catboost', 'xgboost', 'lightgbm'):
        try:
            accepts_verbose = 'verbose' in inspect.signature(clf.fit).parameters
        except (TypeError, ValueError):
            # Signature not introspectable: keep the original call shape.
            accepts_verbose = True
        if accepts_verbose:
            fit_kwargs['verbose'] = False

    for train_index, val_index in kf.split(X_train, y_train):
        clf.fit(X_train[train_index], y_train[train_index], **fit_kwargs)
        # NOTE(review): assumes predict_proba returns one column per class
        # found in y_train, i.e. every class appears in every training fold.
        oof_train[val_index] = clf.predict_proba(X_train[val_index])
        oof_test += clf.predict_proba(X_test)

    # Average the per-fold test predictions.
    return oof_train, oof_test / float(n_folds)

# Baseline for comparison: a single classifier on its own, without stacking
# clf_second = RandomForestClassifier()
# clf_second = LGBMClassifier()
# clf_second.fit(data_train, label_train)
# pred = clf_second.predict(data_test)
# accuracy = metrics.accuracy_score(label_test, pred)*100
# print(accuracy)
# 91.0969793323

# Stacking, level 1: run every base classifier through get_oof and collect
# its out-of-fold training probabilities and its averaged test probabilities.
# These become the feature matrices for the second-level model.
newfeature_list = []
newtestdata_list = []
for position, (model_name, model) in enumerate(clfs.items(), start=1):
    print(position, ":" + model_name + " start training...")
    train_block, test_block = get_oof(
        clf=model,
        n_folds=5,
        X_train=data_train,
        y_train=label_train,
        X_test=data_test,
        model_name=model_name,
    )
    newfeature_list.append(train_block)
    newtestdata_list.append(test_block)

print('Feature combination...')
print(np.shape(newfeature_list))  # (11, 539, 10)
# Join the per-model probability blocks side by side (column-wise).
newfeature = np.concatenate(newfeature_list, axis=1)
print(np.shape(newfeature))  # (539, 110)
newtestdata = np.concatenate(newtestdata_list, axis=1)


# Level 2: use the first-level outputs as the training set for the meta-classifier
# print('Hyperparameter optimization...')
# def acc_model(params,X_train,Y_train):
#     clf = RandomForestClassifier(**params)
#     return cross_val_score(clf, X_train, Y_train).mean()
#
# param_space = {
#     # 'max_depth': hp.choice('max_depth', range(1,20)),
#     'max_depth': hp.choice('max_depth', range(15, 20)),
#     # 'max_features': hp.choice('max_features', range(10,150)),
# 'max_features': hp.choice('max_features', range(10,15)),
#     # 'n_estimators': hp.choice('n_estimators', range(100,500)),
#     'n_estimators': hp.choice('n_estimators', range(10, 20)),
#     'criterion': hp.choice('criterion', ["gini", "entropy"])}
#
# best = 0
# def f(params):
#     global best,param_best
#     acc = acc_model(params,newfeature,label_train)
#     if acc > best:
#         best = acc
#         param_best=params
#     # print('\nnew best:', best, params)
#     return {'loss': -acc, 'status': STATUS_OK}
#
# fmin(f, param_space, algo=tpe.suggest, max_evals=100, trials=Trials())
# print('best:')
# print(param_best)

# clf_second = RandomForestClassifier(**param_best)
# clf_second.fit(newfeature, label_train)
# oof = clf_second1.predict(newfeature)
# pred = clf_second.predict(newtestdata)
# clf = { 'lightgbm':LGBMClassifier(boosting_type='gbdt',objective='multiclass',metric='multi_error')}
# Stacking, level 2: fit CatBoost on the first-level probability features,
# again through get_oof, so the test prediction is the average over 5 folds.
clf = {'catboost': CatBoostClassifier(loss_function='MultiClass', custom_loss=['F1'])}
oof, pred_proba = get_oof(
    clf=clf['catboost'],
    n_folds=5,
    X_train=newfeature,
    y_train=label_train,
    X_test=newtestdata,
    model_name='catboost',
)
# Pick the most probable column per sample.
# NOTE(review): a column index equals the class label only when the labels
# are 0..n_classes-1 in sorted order -- confirm if the dataset changes.
pred = np.argmax(pred_proba, axis=1)
# f1_score takes the ground truth first, then the predictions.
print('f1', metrics.f1_score(label_test, pred, average='macro'))
# f1 0.9634845862097758

# accuracy = metrics.accuracy_score(label_test, pred) * 100
# print(accuracy)
# 96.4228934817
# 96.66136724960255
# 96.89984101748807
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值