Digit Recognizer by LightGBM

I tried Kaggle's Digit Recognizer with both LightGBM and xgboost, and used GridSearchCV to tune the parameters, mainly max_depth, learning_rate and n_estimators. The final score came out at 0.9747.

My experience with tuning is limited, and I am not sure how to improve the score further.
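
One direction I have not actually tried, so treat it as a rough sketch rather than tuned results, would be to widen the search beyond max_depth, learning_rate and n_estimators, e.g. adding num_leaves, colsample_bytree and reg_alpha to an LGBMClassifier grid (the value ranges below are guesses):

import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

dataset = pd.read_csv("data/train.csv", header=0)
d_x = dataset.iloc[:, 1:].values
d_y = dataset.iloc[:, 0].values

# Untested sketch: widen the grid around the values found earlier.
param_grid = {
    'num_leaves': [31, 63, 127],      # tree complexity, interacts with max_depth
    'colsample_bytree': [0.8, 1.0],   # feature subsampling per tree
    'reg_alpha': [0.0, 0.3, 0.7],     # L1 regularization
}
model = lgb.LGBMClassifier(
    objective='multiclass',
    max_depth=7,
    learning_rate=0.25,
    n_estimators=85,
    n_jobs=4,
    random_state=42)
searcher = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
searcher.fit(d_x, d_y)
print(searcher.best_params_, searcher.best_score_)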


Also, I still have not figured out how to use GridSearchCV with xgboost; if anyone knows how, please let me know.

Here is the LightGBM code:

#!/usr/bin/python
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV

# specify your configurations as a dict
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': 10,
    'verbose': 0,
    'metric': 'multi_logloss',
    'max_bin': 255,
    'max_depth': 7,
    'learning_rate': 0.3,
    'nthread': 4,
    'n_estimators': 85,
    #'feature_fraction': 0.8
}


def train_model(model_file='model/lgb'):
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values
    d_y = dataset.iloc[:, 0].values
    train_X, test_X, train_Y, test_Y = train_test_split(
        d_x, d_y, test_size=0.33, random_state=42)

    lgb_train = lgb.Dataset(train_X, label=train_Y)
    lgb_eval = lgb.Dataset(test_X, label=test_Y, reference=lgb_train)

    print("begin train...")
    bst = lgb.train(
        params,
        lgb_train,
        valid_sets=[lgb_eval],
        num_boost_round=160,
        early_stopping_rounds=10)
    print("train end\nsaving...")
    bst.save_model(model_file)
    return bst


def create_submission():
    # get model
    bst = train_model()

    # load test data
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = test_df.iloc[:, :].values
    print("predicting...")
    pred = bst.predict(xg_test)
    print("predict end.")
    # create csv file
    print("create submission file...")
    # predict() gives per-class probabilities for a multiclass booster,
    # so the predicted digit is simply the most probable class
    pred = [int(np.argmax(x)) for x in pred]
    submission = pd.DataFrame({
        'ImageId': range(1, len(pred) + 1),
        'Label': pred
    })
    #submission.to_csv("submission.csv", index=False)
    np.savetxt(
        'submission.csv',
        np.c_[range(1, len(pred) + 1), pred],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")


def tune_model():
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values
    d_y = dataset.iloc[:, 0].values

    print("create classifier...")
    param_grid = {
        #"reg_alpha": [0.3, 0.7, 0.9, 1.1],
        "learning_rate": [0.1, 0.25, 0.3],
        'n_estimators': [75, 80, 85, 90],
        'max_depth': [6, 7, 8, 9]
    }
    # max_depth = 7, learning_rate: 0.25
    model = lgb.LGBMClassifier(
        boosting_type='gbdt', objective="multiclass", n_jobs=8, random_state=42)
    print("run grid search...")
    searcher = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
    searcher.fit(d_x, d_y)
    print(searcher.cv_results_['mean_test_score'])
    print("=" * 30, '\n')
    print(searcher.best_params_)
    print("=" * 30, '\n')
    print(searcher.best_score_)
    print("end")
    print "end"


if __name__ == "__main__":
    #create_submission()
    tune_model()
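
As a side note, the booster written by save_model above can be loaded back later without retraining; a minimal sketch, assuming the same 'model/lgb' path and the 784 pixel columns of this dataset:

import numpy as np
import lightgbm as lgb

# Load the booster that train_model() saved to disk.
bst = lgb.Booster(model_file='model/lgb')

# For a multiclass booster, predict() returns one probability per class,
# so the predicted digit is the argmax over the 10 columns.
pixels = np.zeros((1, 784))   # placeholder row; real rows come from data/test.csv
probs = bst.predict(pixels)
print(int(np.argmax(probs, axis=1)[0]))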


And the xgboost code:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

params = {
    "objective": "multi:softmax",
    "eta": 0.25,
    'max_depth': 7,
    'silent': 1,
    'nthread': 4,
    'num_class': 10,
}


def train_model():
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    train_X = dataset.iloc[:, 1:].values
    train_Y = dataset.iloc[:, 0].values
    xg_train = xgb.DMatrix(train_X, label=train_Y)

    print("begin train...")
    bst = xgb.train(params, xg_train, 10)
    print("train end\nsaving...")
    bst.save_model("model/bst")
    return bst
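

# Rough, untested sketch (a hypothetical helper, not called anywhere in this
# script): unlike the LightGBM version, train_model() above runs a fixed 10
# boosting rounds with no validation data. Adding an eval set plus early
# stopping, reusing the module-level `params`, could look like this.
def train_model_with_eval():
    from sklearn.model_selection import train_test_split
    dataset = pd.read_csv("data/train.csv", header=0)
    d_x = dataset.iloc[:, 1:].values
    d_y = dataset.iloc[:, 0].values
    train_X, valid_X, train_Y, valid_Y = train_test_split(
        d_x, d_y, test_size=0.33, random_state=42)

    xg_train = xgb.DMatrix(train_X, label=train_Y)
    xg_valid = xgb.DMatrix(valid_X, label=valid_Y)

    # Training stops once the multiclass error on the held-out set
    # has not improved for 10 rounds.
    return xgb.train(
        params,
        xg_train,
        num_boost_round=160,
        evals=[(xg_valid, 'valid')],
        early_stopping_rounds=10)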


def create_submission():
    test_df = pd.read_csv("data/test.csv", header=0)
    xg_test = xgb.DMatrix(test_df.iloc[:, :].values)

    bst = train_model()

    print("predicting...")
    pred = bst.predict(xg_test)
    print("predict end.")
    # create csv file
    print("create submission file...")
    submission = pd.DataFrame({
        'ImageId': range(1, len(pred) + 1),
        'Label': [int(x) for x in pred]
    })
    #submission.to_csv("submission.csv", index=False)
    np.savetxt(
        'submission.csv',
        np.c_[range(1, len(pred) + 1), pred],
        delimiter=',',
        header='ImageId,Label',
        comments='',
        fmt='%d')
    print("----end----")


def tune_parameters():
    print("load data ...")
    dataset = pd.read_csv("data/train.csv", header=0)
    # only the first 100 rows, to keep the grid search quick
    train_X = dataset.iloc[:100, 1:].values
    train_Y = dataset.iloc[:100, 0].values

    param_grid = {'learning_rate': [0.1, 0.4]}
    print("create classifier...")
    model = xgb.XGBClassifier(
        max_depth=6,
        learning_rate=0.1,
        n_estimators=10,
        objective="multi:softmax",
        random_state=36,
        n_jobs=8)
    # keep the labels as a 1-D array of class ids and use a multiclass
    # scorer; 'roc_auc' with binarized labels does not fit this estimator
    searcher = GridSearchCV(
        estimator=model, param_grid=param_grid, scoring='accuracy', cv=3)
    print("fitting ...")
    searcher.fit(train_X, train_Y)
    print(searcher.cv_results_['mean_test_score'])
    print(searcher.best_params_, searcher.best_score_)
    print("end...")


if __name__ == "__main__":
    tune_parameters()







