赛题概况
比赛地址:DataFountain 竞赛平台(Competitions)上的“个贷违约预测”赛题。
比赛要求根据给定的数据集,建立模型,捕捉不同业务中用户基本信息与违约行为之间的关联,实现对新业务的用户违约预测。
部分代码示例
这里给大家提供一个 baseline 代码示例:
from xgboost import XGBClassifier
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPRegressor,MLPClassifier
from sklearn.linear_model import BayesianRidge
from catboost import CatBoostClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import re
import pandas as pd
import lightgbm as lgb
import numpy as np
from sklearn.metrics import roc_auc_score, precision_recall_curve, roc_curve, average_precision_score,mean_squared_error
from sklearn.model_selection import KFold
from lightgbm import LGBMClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import gc
from sklearn.model_selection import StratifiedKFold
from dateutil.relativedelta import relativedelta
# Load the four competition CSV files from the current working directory.
# The files are read one after another in the order listed, and each
# resulting DataFrame is bound to the module-level name in the same position.
train_data, submit_example, test_public, train_inte = (
    pd.read_csv(csv_path)
    for csv_path in (
        './train_public.csv',
        './submit_example.csv',
        './test_public.csv',
        './train_internet.csv',
    )
)
# Display settings used when inspecting DataFrames interactively.
# Fully-qualified option names are used on purpose: abbreviated keys such as
# 'max_columns' / 'max_rows' are resolved by pattern matching, and on pandas
# versions that also define 'styler.render.max_columns' / 'styler.render.max_rows'
# the abbreviation matches several options and pd.set_option raises OptionError.
pd.set_option('display.max_columns', None)  # None: do not truncate the column list
pd.set_option('display.max_rows', 200)  # print at most 200 rows before truncating
pd.set_option('display.float_format', lambda x: '%.6f' % x)  # fixed 6 decimal places
def train_model(data_, test_, y_, folds_):
oof_preds = np.zeros(data_.shape[0])
sub_preds = np.zeros(test_.shape[0])
feature_importance_df = pd.DataFrame()
feats = [f for f in data_.columns if f not in ['loan_id', 'user_id', 'isDefault'] ]
for n_fold, (trn_idx, val_idx) in enumerate(folds_.split(data_)):
trn_x, trn_y = data_[feats].iloc[trn_idx], y_.iloc[trn_idx]
val_x, val_y = data_[feats].iloc[val_idx], y_.iloc[val_idx]
clf = LGBMClassifier(
n_estimators=4000,
learning_rate=0.08,
num_leaves=2**5,
colsample_b