导入包
# coding: utf-8
#导入包
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error, mean_squared_error
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')
读取训练集数据
# Load the four training tables (base profile, operations, transactions,
# labels) from the local 'train' directory, in that order.
train_base, train_op, train_trans, train_label = map(
    pd.read_csv,
    (
        'train/train_base.csv',
        'train/train_op.csv',
        'train/train_trans.csv',
        'train/train_label.csv',
    ),
)
读取测试集数据
# Load the three test tables (base profile, operations, transactions)
# from the local 'test' directory, in that order.
test_base, test_op, test_trans = map(
    pd.read_csv,
    (
        'test/test_a_base.csv',
        'test/test_a_op.csv',
        'test/test_a_trans.csv',
    ),
)
我们发现训练集和测试集中 service3_level 字段缺失较多,因此删除此列。然后删除训练集中 sex 列或 balance_avg 列为空的行,得到不含缺失值的训练集。
对于测试集,其他列缺失值很少,直接用下一行的数据向上回填(bfill)补全。
# Build the modelling frames and remove missing values.
#
# NOTE(review): the original script used train_df / test_df here without ever
# assigning them, which raises NameError. They are presumably the base tables,
# with the label joined on for training -- confirm that train_label carries a
# 'user' key and a 'label' column before relying on this join.
train_df = train_base.merge(train_label, on='user')
test_df = test_base.copy()

# Training set: drop rows lacking 'sex' or 'balance_avg', then remove the
# 'service3_level' column. Chaining returns a fresh frame, so nothing is
# modified in place on a filtered slice.
train_df = train_df.dropna(subset=['sex', 'balance_avg']).drop(columns='service3_level')
train_df.head()

# Test set: remove the same column and back-fill the remaining gaps from the
# following row.
test_df = test_df.drop(columns='service3_level').bfill()
test_df.info()
对于base数据集中的类别编码字段:有明显顺序关系的特征以及类别数目较少的特征,采用label编码;类别较多且没有顺序关系的特征,暂时舍弃。
# Label-encode the ordinal / low-cardinality columns of train_df and test_df.
#
# Every listed column stores strings of the form '<prefix> <n>' (for example
# 'category 1' or 'level 12'). Each entry below gives the prefix and the
# inclusive range of n that is converted to the integer n; these are exactly
# the values the original line-by-line assignments handled.
_LABEL_ENCODING_SPEC = {
    'sex': ('category', 0, 1),
    'level': ('category', 0, 2),
    'verified': ('category', 0, 1),
    'agreement1': ('category', 0, 1),
    'agreement2': ('category', 0, 1),
    'agreement3': ('category', 0, 1),
    'agreement4': ('category', 0, 1),
    'balance': ('level', 0, 21),
    'balance_avg': ('level', 0, 21),
    'balance1': ('level', 0, 21),
    'balance1_avg': ('level', 0, 21),
    'balance2': ('level', 0, 21),
    'balance2_avg': ('level', 0, 5),
    'service3': ('category', 0, 1),
    'product1_amount': ('level', 1, 7),
    'product2_amount': ('level', 1, 21),
    'product3_amount': ('level', 1, 3),
    'product4_amount': ('level', 0, 1),
    'product5_amount': ('level', 0, 1),
    'product6_amount': ('level', 1, 21),
}


def _label_encode(frame, spec):
    """Replace '<prefix> <n>' strings by the integer n, column by column.

    Args:
        frame: DataFrame to encode; its columns are reassigned in place.
        spec: Mapping of column name to (prefix, first, last), where
            first..last is the inclusive range of n to convert.

    Raises:
        KeyError: If a column named in ``spec`` is missing from ``frame``.

    Values outside the listed range (including NaN) are left unchanged.
    Whole columns are reassigned instead of using chained
    ``frame[col].loc[mask] = value`` assignment, which pandas does not
    guarantee to write through to the frame (and which never does under
    Copy-on-Write).
    """
    for column, (prefix, first, last) in spec.items():
        codes = {'%s %d' % (prefix, n): n for n in range(first, last + 1)}
        frame[column] = frame[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )


_label_encode(train_df, _LABEL_ENCODING_SPEC)
_label_encode(test_df, _LABEL_ENCODING_SPEC)
删除类别较多的特征
# Discard the high-cardinality categorical columns from both frames; they are
# not encoded by this baseline.
high_cardinality_columns = ['province', 'provider', 'city', 'regist_type']
for frame in (train_df, test_df):
    frame.drop(columns=high_cardinality_columns, inplace=True)
更改数据类型
# Cast every remaining object-dtype column (except the 'user' id) to int in
# both frames. A column that still holds a non-numeric string makes
# astype(int) raise ValueError, which surfaces values the encoding step did
# not cover.
for frame in (train_df, test_df):
    for column in frame.columns.drop('user'):
        if frame[column].dtype == object:
            frame[column] = frame[column].astype(int)
训练模型
# Train LightGBM with a hold-out split, refit on all rows, and predict.
drop_columns = ["user", "label"]
features = train_df.columns.drop(drop_columns)
train_x = train_df[features]
train_y = train_df['label']
# Forward-fill whatever gaps are still present in the test features.
test_x = test_df[features].ffill()

# Hold out the last 10% of rows (file order, no shuffling) for validation.
nums = int(train_x.shape[0] * 0.90)
trn_x, trn_y = train_x[:nums], train_y[:nums]
val_x, val_y = train_x[nums:], train_y[nums:]
train_matrix = lgb.Dataset(trn_x, label=trn_y)
valid_matrix = lgb.Dataset(val_x, label=val_y)
data_matrix = lgb.Dataset(train_x, label=train_y)

params = {
    'boosting_type': 'gbdt',
    # A list (not a set) keeps the metric order the same on every run.
    'metric': ['binary_logloss', 'auc'],
    'min_child_weight': 5,
    'num_leaves': 2 ** 6,  # 64
    'objective': 'binary',
    'feature_fraction': 0.9,
    'bagging_fraction': 0.7,
    'bagging_freq': 1,
    'learning_rate': 0.01,
    'seed': 520,
    'min_data_in_leaf': 500,
}

# Stage 1: fit on the first 90% with early stopping to find the round count.
# NOTE(review): LightGBM >= 4.0 no longer accepts verbose_eval /
# early_stopping_rounds here; on such versions pass
# callbacks=[lgb.early_stopping(1000), lgb.log_evaluation(500)] instead.
model = lgb.train(
    params,
    train_matrix,
    50000,
    valid_sets=[train_matrix, valid_matrix],
    verbose_eval=500,
    early_stopping_rounds=1000,
)
# Stage 2: refit on all labelled rows for the round count found in stage 1.
model2 = lgb.train(params, data_matrix, model.best_iteration)

# Validation predictions come from the hold-out model at its best iteration.
# Test predictions come from the full-data refit. The original predicted both
# with `model` and `num_iteration=model2.best_iteration`, so the refit was
# never used and the requested iteration was not an early-stopped one.
val_pred = model.predict(val_x, num_iteration=model.best_iteration).reshape(-1, 1)
test_pred = model2.predict(test_x).reshape(-1, 1)
lgb_train, lgb_test = val_pred, test_pred
提交文件
# Assemble the submission (user id plus predicted probability), order it by
# user, and write it to 'sub.csv' without the index column.
sub = pd.DataFrame({'user': test_df['user'], 'prob': test_pred[:, 0]})
sub = sub.sort_values('user')
sub.to_csv('sub.csv', index=False)
本baseline只是简单地对数据进行了编码,还没有构造其他特征,也没有做进一步的数据预处理。