【xgboost】使用xgboost训练一个简单模型

使用 pandas 读取以制表符分隔的特征数据,并处理数据中的双引号(空字段 `""`)。
使用 xgboost 训练一版二分类模型。
依赖版本:xgboost==1.6.2

#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import logging
import csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss

# Version tag appended to the file name of the saved model.
model_version = "v101"
# Base directory; the trained model is written under "<root_path>/saved_model/".
root_path = "/home/.../model/xgboost_tool"

class DataProcess(object):
    """Load tab-separated train/test feature files and split them into X / Y.

    Each file is read without a header row. Column 0 is used as the label,
    column 1 is skipped, and columns 2 onward are used as features. Every
    cell is converted with ``pd.to_numeric(errors='coerce')``, so values that
    cannot be parsed as numbers become NaN.

    Args:
        train_path: Path of the training file.
        test_path: Path of the test file.
        dev_size: Forwarded to ``train_test_split`` as ``test_size`` when the
            training data is split into train/dev parts. Defaults to 0.2.
        random_state: Forwarded to ``train_test_split``. The default ``None``
            keeps the split unseeded; pass an int for a reproducible split.

    Attributes:
        train_data / test_data: Frames as parsed by ``pd.read_csv`` (after the
            ``'""'`` replacement).
        train_df / test_df: The same frames coerced to numeric values.
    """

    def __init__(self, train_path, test_path, dev_size=0.2, random_state=None):
        self.dev_size = dev_size
        self.random_state = random_state
        self.train_data, self.train_df = self._load(train_path)
        self.test_data, self.test_df = self._load(test_path)

    @staticmethod
    def _load(path):
        """Read one feature file; return ``(raw_frame, numeric_frame)``."""
        raw = pd.read_csv(path, header=None, sep="\t", quoting=csv.QUOTE_MINIMAL, escapechar='\\')
        # Cells holding a literal pair of double quotes are blanked so that
        # the numeric coercion below turns them into NaN.
        raw.replace('""', '', inplace=True)
        numeric = raw.apply(pd.to_numeric, errors='coerce')
        return raw, numeric

    @staticmethod
    def _split_features_labels(frame):
        """Split a numeric frame into ``(features, labels)``.

        Raises:
            ValueError: If the label column contains NaN, i.e. a label was
                missing or could not be parsed as a number.
        """
        labels = frame.iloc[:, 0]
        # Fail with an explicit message instead of the opaque cast error that
        # ``astype(int)`` raises on NaN.
        if labels.isna().any():
            raise ValueError(
                "Label column (column 0) contains missing or non-numeric values; "
                "cannot convert labels to int."
            )
        features = frame.iloc[:, 2:].astype(float)
        return features, labels.astype(int)

    def data_process(self, mode="train"):
        """Return the processed data for ``mode``.

        Args:
            mode: ``"train"`` returns ``(X_train, X_dev, Y_train, Y_dev)``.
                Any other value returns ``(X_test, Y_test)``.
        """
        if mode == "train":
            return self.train_data_process()
        return self.test_data_process()

    def train_data_process(self):
        """Split the training data into ``(X_train, X_dev, Y_train, Y_dev)``."""
        data_X, data_Y = self._split_features_labels(self.train_df)
        return train_test_split(
            data_X, data_Y, test_size=self.dev_size, random_state=self.random_state
        )

    def test_data_process(self):
        """Return ``(X_test, Y_test)`` built from the test data."""
        return self._split_features_labels(self.test_df)

def xgb_fit_single(X_train, Y_train, X_test, Y_test, num_round=10, max_depth=5, learning_rate=0.1):
    """Train a binary XGBoost classifier and save it as a JSON model file.

    The model is evaluated on ``(X_test, Y_test)`` after every boosting round
    with the ``auc`` and ``logloss`` metrics, and is then written to
    ``<root_path>/saved_model/<num_round>_<max_depth>_<learning_rate>_<model_version>.json``.

    Args:
        X_train: Training features.
        Y_train: Training labels.
        X_test: Evaluation features.
        Y_test: Evaluation labels.
        num_round: Number of boosted trees (``n_estimators``). Defaults to 10.
        max_depth: Maximum tree depth. Defaults to 5.
        learning_rate: Boosting learning rate. Defaults to 0.1.

    Returns:
        The fitted ``XGBClassifier``.
    """
    # Local import so this function does not depend on a file-level `import os`.
    import os

    logging.info("Train model start...")
    model = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=num_round,
        # `silent` is not a valid parameter in xgboost 1.x; `verbosity=0` is
        # the supported way to request silent training.
        verbosity=0,
        objective='binary:logistic',
        # Passing `eval_metric` to fit() is deprecated since xgboost 1.6, so
        # it is set on the estimator instead.
        eval_metric=["auc", "logloss"],
    )
    model.fit(X_train, Y_train, eval_set=[(X_test, Y_test)], verbose=True)

    save_dir = f"{root_path}/saved_model"
    # save_model does not create missing directories.
    os.makedirs(save_dir, exist_ok=True)
    model_name = f"{num_round}_{max_depth}_{learning_rate}_{model_version}.json"
    model.save_model(f"{save_dir}/{model_name}")
    return model

if __name__ == '__main__':
    # The root logger defaults to WARNING, so without this the logging.info()
    # call made during training would be dropped silently.
    logging.basicConfig(level=logging.INFO)

    train_file_path = "/home/.../train.csv"
    test_file_path = "/home/.../test.csv"
    # Loads both the training and the test file up front.
    dp = DataProcess(train_file_path, test_file_path)

    # Only the train/dev split is used below; the dev part serves as the
    # evaluation set during training.
    X_train, X_dev, Y_train, Y_dev = dp.data_process(mode="train")

    print("Start xgboost training")
    model = xgb_fit_single(X_train, Y_train, X_dev, Y_dev)
    
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值