使用 pandas 读取特征数据，并处理数据中的双引号。
使用 xgboost 训练一版模型。
依赖版本：xgboost==1.6.2
#!/usr/bin/env python
# -*- coding:utf-8 -*-
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import xgboost as xgb
import logging
import csv
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
# Version tag appended to the file name of each saved model.
model_version = "v101"
# Base directory of the tool; models are saved under "<root_path>/saved_model".
root_path = "/home/.../model/xgboost_tool"
class DataProcess:
    """Load tab-separated train/test files and split them into features and labels.

    Both files are read without a header row. Every cell is passed through
    ``pd.to_numeric(errors='coerce')``, so any non-numeric cell becomes NaN.
    Column 0 is used as the integer label, column 1 is not used, and columns
    2 onward are used as float features.

    Attributes:
        train_data / test_data: the frames as parsed by ``pd.read_csv``.
        train_df / test_df: the same frames after numeric coercion.
    """

    def __init__(self, train_path, test_path):
        """Read and numerically coerce both files.

        Args:
            train_path: path of the tab-separated training file.
            test_path: path of the tab-separated test file.
        """
        self.train_data, self.train_df = self._load(train_path)
        self.test_data, self.test_df = self._load(test_path)

    @staticmethod
    def _load(path):
        """Read one TSV file; return ``(raw_frame, numeric_frame)``."""
        raw = pd.read_csv(path, header=None, sep="\t", quoting=csv.QUOTE_MINIMAL, escapechar='\\')
        # Cells that are exactly two double-quote characters are blanked so
        # that the numeric coercion below turns them into NaN.
        raw.replace('""', '', inplace=True)
        numeric = raw.apply(pd.to_numeric, errors='coerce')
        return raw, numeric

    @staticmethod
    def _split_features_labels(frame):
        """Return ``(features, labels)`` sliced from a coerced frame."""
        # Column 1 is deliberately skipped -- presumably an id column; TODO confirm.
        features = frame.iloc[:, 2:].astype(float)
        # NOTE: astype(int) raises if a label was coerced to NaN.
        labels = frame.iloc[:, 0].astype(int)
        return features, labels

    def data_process(self, mode="train", test_size=0.2, random_state=None):
        """Return the prepared arrays for the requested mode.

        Args:
            mode: ``"train"`` returns ``(X_train, X_dev, Y_train, Y_dev)``;
                any other value returns ``(X_test, Y_test)``.
            test_size: hold-out fraction forwarded to the train/dev split
                (only used when ``mode == "train"``).
            random_state: seed forwarded to the train/dev split; ``None``
                keeps the split random on every call.
        """
        if mode == "train":
            return self.train_data_process(test_size=test_size, random_state=random_state)
        return self.test_data_process()

    def train_data_process(self, test_size=0.2, random_state=None):
        """Split the training frame into train and dev parts.

        Args:
            test_size: fraction of rows placed in the dev set.
            random_state: seed for a reproducible split; ``None`` (default)
                leaves the split unseeded.

        Returns:
            Tuple ``(X_train, X_dev, Y_train, Y_dev)``.
        """
        data_X, data_Y = self._split_features_labels(self.train_df)
        X_train, X_dev, Y_train, Y_dev = train_test_split(
            data_X, data_Y, test_size=test_size, random_state=random_state
        )
        return X_train, X_dev, Y_train, Y_dev

    def test_data_process(self):
        """Return ``(X_test, Y_test)`` from the test frame, without splitting."""
        X_test, Y_test = self._split_features_labels(self.test_df)
        return X_test, Y_test
def xgb_fit_single(X_train, Y_train, X_test, Y_test, num_round=10, max_depth=5, learning_rate=0.1):
    """Train one XGBoost binary classifier and save it as a JSON model file.

    Args:
        X_train: training features.
        Y_train: training labels.
        X_test: features of the evaluation set monitored during training.
        Y_test: labels of the evaluation set monitored during training.
        num_round: number of boosted trees (``n_estimators``).
        max_depth: maximum depth of each tree.
        learning_rate: boosting learning rate.

    Returns:
        The fitted ``XGBClassifier``. The model is also written to
        ``<root_path>/saved_model/<num_round>_<max_depth>_<learning_rate>_<model_version>.json``.
    """
    import os  # local import: the module does not import os at file level

    logging.info("Train model start...")
    model = XGBClassifier(
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=num_round,
        # `verbosity=0` replaces the obsolete `silent=True` flag, which
        # current xgboost releases no longer recognise.
        verbosity=0,
        objective='binary:logistic',
        # Passing eval_metric to fit() is deprecated as of xgboost 1.6;
        # the constructor is the supported place for it.
        eval_metric=["auc", "logloss"],
    )
    eval_set = [(X_test, Y_test)]
    model.fit(X_train, Y_train, eval_set=eval_set, verbose=True)

    model_name = f"{num_round}_{max_depth}_{learning_rate}_{model_version}.json"
    save_dir = f"{root_path}/saved_model"
    # save_model does not create missing directories, so make sure it exists.
    os.makedirs(save_dir, exist_ok=True)
    model.save_model(f"{save_dir}/{model_name}")
    return model
if __name__ == '__main__':
    # Without this the root logger stays at WARNING and the INFO message
    # logged at the start of training is silently dropped.
    logging.basicConfig(level=logging.INFO)

    train_file_path = "/home/.../train.csv"
    test_file_path = "/home/.../test.csv"

    # Load both files; only the training file is used below, where it is
    # split into a train part and a dev part.
    dp = DataProcess(train_file_path, test_file_path)
    X_train, X_dev, Y_train, Y_dev = dp.data_process(mode="train")

    print("Start xgboost training")
    # The dev split serves as the evaluation set monitored during training.
    model = xgb_fit_single(X_train, Y_train, X_dev, Y_dev)