第二届翼支付杯大数据建模大赛-信用风险用户识别Baseline 线上0.65+稳进复赛

导入包

# coding: utf-8
#导入包
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

读取训练集数据

# Load the four training CSV files into DataFrames.
train_base, train_op, train_trans, train_label = (
    pd.read_csv(csv_path)
    for csv_path in (
        'train/train_base.csv',
        'train/train_op.csv',
        'train/train_trans.csv',
        'train/train_label.csv',
    )
)

读取测试集数据

# Load the three test-set (round A) CSV files into DataFrames.
test_base, test_op, test_trans = (
    pd.read_csv(csv_path)
    for csv_path in (
        'test/test_a_base.csv',
        'test/test_a_op.csv',
        'test/test_a_trans.csv',
    )
)

我们发现训练集和测试集中service3_level字段缺失较多,因此删除此列。然后删除训练集中sex列或balance_avg列为空的行,得到不含缺失值的训练集。
对于测试集,其他列缺失值很少,直接用下一行的数据补全(bfill)。

# Build the working frames. The script used train_df / test_df below without
# ever defining them, which raises NameError on the first statement.
# NOTE(review): assumes train_label.csv carries a `user` key column next to
# the `label` column that the training step reads -- confirm against the data.
train_df = train_base.merge(train_label, on='user')
test_df = test_base.copy()

# Training set: keep only rows where both `sex` and `balance_avg` are present,
# then remove the mostly-empty `service3_level` column. Reassigning (instead
# of inplace edits on a filtered slice) avoids SettingWithCopy problems.
train_df = train_df.dropna(subset=['sex', 'balance_avg'])
train_df = train_df.drop('service3_level', axis=1)
train_df.head()

# Test set: remove the same column and fill the remaining gaps from the next
# row instead of dropping rows. The trailing ffill() covers NaNs in the last
# rows, which a backward fill alone cannot reach.
test_df = test_df.drop('service3_level', axis=1)
test_df = test_df.bfill().ffill()
test_df.info()

对于base数据集中的类别编码字段:有明显顺序关系或类别数目较少的特征,采用label编码;类别较多且没有顺序关系的特征,暂时舍弃。

# Label encoding.
#
# The raw string values have the form 'category <n>' or 'level <n>'; the code
# stored in their place is the integer <n>.
#
# The previous implementation spelled out every (column, value) pair with
# chained assignment (`df[col].loc[mask] = n`). That writes through a
# temporary Series, so it is not guaranteed to update the frame and is a
# silent no-op under pandas Copy-on-Write. It also only handled the level
# numbers that happened to be listed by hand.

# Columns whose values are written as 'category <n>'.
CATEGORY_COLUMNS = [
    'sex', 'level', 'verified',
    'agreement1', 'agreement2', 'agreement3', 'agreement4',
    'service3',
]

# Columns whose values are written as 'level <n>'.
LEVEL_COLUMNS = [
    'balance', 'balance_avg',
    'balance1', 'balance1_avg',
    'balance2', 'balance2_avg',
    'product1_amount', 'product2_amount', 'product3_amount',
    'product4_amount', 'product5_amount', 'product6_amount',
]


def encode_prefixed_codes(df, columns, prefix):
    """Replace '<prefix> <n>' strings in *columns* of *df* with the integer n.

    Values that do not have exactly that shape (NaN, values that are already
    numeric, strings with another prefix) are left unchanged, mirroring the
    value-by-value replacement this helper supersedes. Each column is
    reassigned as a whole, so the update always lands on *df* itself.

    Args:
        df: DataFrame to modify in place.
        columns: Names of the columns to encode.
        prefix: Text before the number, e.g. 'category' or 'level'.

    Returns:
        The same *df*, to allow chaining.
    """
    marker = prefix + ' '

    def to_code(value):
        if isinstance(value, str) and value.startswith(marker):
            digits = value[len(marker):]
            if digits.isdecimal():
                return int(digits)
        return value

    for col in columns:
        df[col] = df[col].map(to_code)
    return df


# Apply the identical encoding to the training and the test frame.
for frame in (train_df, test_df):
    encode_prefixed_codes(frame, CATEGORY_COLUMNS, 'category')
    encode_prefixed_codes(frame, LEVEL_COLUMNS, 'level')

删除类别较多的特征

# Remove the categorical columns with many distinct values from both frames.
HIGH_CARDINALITY_COLUMNS = ['province', 'provider', 'city', 'regist_type']
for frame in (train_df, test_df):
    frame.drop(columns=HIGH_CARDINALITY_COLUMNS, inplace=True)

更改数据类型

# Cast every object-dtype column (all columns except `user`) to int, first in
# the training frame and then in the test frame.
for frame in (train_df, test_df):
    features = frame.columns.drop('user')
    for name in features:
        column = frame[name]
        if column.dtype == object:
            frame[name] = column.astype(int)

训练模型

# Train LightGBM with a hold-out split, refit on all rows, predict the test set.
drop_columns = ["user", "label"]
clf = lgb

# Model inputs: every column except the id and the target.
features = train_df.columns.drop(drop_columns)
train_x = train_df[features]
# Forward-fill any gaps still left in the test features.
test_x = test_df[features].ffill()
train_y = train_df['label']

# Hold out the last 10% of the rows (in frame order, no shuffling) for
# validation / early stopping.
nums = int(train_x.shape[0] * 0.90)
trn_x, trn_y = train_x[:nums], train_y[:nums]
val_x, val_y = train_x[nums:], train_y[nums:]

train_matrix = clf.Dataset(trn_x, label=trn_y)
valid_matrix = clf.Dataset(val_x, label=val_y)
data_matrix = clf.Dataset(train_x, label=train_y)

params = {
    'boosting_type': 'gbdt',
    # A list rather than a set, so the metric order is the same on every run.
    'metric': ['binary_logloss', 'auc'],
    'min_child_weight': 5,
    'num_leaves': 2 ** 6,  # 64
    'objective': 'binary',
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'learning_rate': 0.01,
    'seed': 520,
    'min_data_in_leaf': 500,
}

# Stage 1: fit on the first 90% and use early stopping on the hold-out to find
# the number of boosting rounds.
# NOTE(review): `verbose_eval` / `early_stopping_rounds` were removed from
# lgb.train() in LightGBM 4.0; on that version pass
# callbacks=[lgb.early_stopping(1000), lgb.log_evaluation(500)] instead.
model = clf.train(
    params,
    train_matrix,
    50000,
    valid_sets=[train_matrix, valid_matrix],
    verbose_eval=500,
    early_stopping_rounds=1000,
)

# Fall back to the number of trained rounds if no best iteration was recorded,
# so the refit never runs with zero rounds.
best_rounds = model.best_iteration or model.current_iteration()

# Stage 2: refit on all labelled rows for exactly that many rounds.
model2 = clf.train(params, data_matrix, best_rounds)

# Hold-out predictions come from the stage-1 model cut at its best iteration;
# model2 has seen those rows, so it must not score them.
val_pred = model.predict(val_x, num_iteration=best_rounds).reshape(-1, 1)
# Test predictions come from the full-data refit. Previously `model` was used
# with `num_iteration=model2.best_iteration`; model2 has no early stopping, so
# that selected every round of `model`, including the ones past its best
# iteration, and left model2 unused.
test_pred = model2.predict(test_x).reshape(-1, 1)

lgb_train, lgb_test = val_pred, test_pred

提交文件

# Assemble the submission (user id + predicted probability), order it by user
# and write it to sub.csv without the index column.
sub = pd.DataFrame({'user': test_df.user, 'prob': test_pred[:, 0]})
sub = sub.sort_values('user')
sub.to_csv('sub.csv', index=False)

本baseline只是简单地对数据进行了编码,还没有构造其他特征,也没有做进一步的数据预处理。

  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 10
    评论
评论 10
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值