import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
# Parse `earliesCreditLine` (e.g. "Aug-2001") into its 4-digit year as an int.
# NOTE(review): assumes every value ends in a 4-digit year — confirm upstream.
for data in [train, testA]:
    data['earliesCreditLine'] = data['earliesCreditLine'].map(lambda s: int(s[-4:]))
# 类别特征处理 (categorical feature processing)
# Inspect the cardinality (number of distinct values) of each categorical feature.
# NOTE(review): `data` here is the leftover loop variable from the cell above,
# i.e. the last frame processed (testA) — confirm this is intentional.
cate_features = [
    'grade', 'subGrade', 'employmentTitle', 'homeOwnership',
    'verificationStatus', 'purpose', 'postCode', 'regionCode',
    'applicationType', 'initialListStatus', 'title', 'policyCode',
]
for f in cate_features:
    print(f, '类型数:', data[f].nunique())
# 异常值处理 (outlier handling)
# 检测异常的方法:均方差 — detect outliers via the mean ± 3·std (3-sigma) rule
def find_outliers_by_3segama(data, fea):
    """Flag values of column *fea* lying outside mean ± 3 standard deviations.

    Adds a new column ``<fea>_outliers`` containing '异常值' (outlier) for
    values beyond the 3-sigma band and '正常值' (normal) otherwise.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to annotate; modified in place and also returned.
    fea : str
        Name of the numeric column to screen.

    Returns
    -------
    pandas.DataFrame
        The same frame with the extra ``<fea>_outliers`` column.
    """
    # Population statistics (np.std uses ddof=0 by default).
    data_std = np.std(data[fea])
    data_mean = np.mean(data[fea])
    outliers_cut_off = data_std * 3
    lower_rule = data_mean - outliers_cut_off
    upper_rule = data_mean + outliers_cut_off
    # Strict inequality: values exactly on the boundary count as normal.
    data[fea + '_outliers'] = data[fea].apply(
        lambda x: '异常值' if x > upper_rule or x < lower_rule else '正常值'
    )
    return data
# Screen every numeric feature with the 3-sigma rule, then summarize the
# outlier counts and how outliers relate to the default label.
data_train = data_train.copy()
for fea in numerical_fea:
    data_train = find_outliers_by_3segama(data_train, fea)
    print(data_train[fea + '_outliers'].value_counts())
    print(data_train.groupby(fea + '_outliers')['isDefault'].sum())
    print('*' * 10)
# Label-encode the high-cardinality categorical columns
# (employmentTitle, postCode, title, subGrade).
# Each encoder is fit on the union of train and test values so that both
# frames share a single, consistent mapping.
for col in tqdm(['employmentTitle', 'postCode', 'title', 'subGrade']):
    le = LabelEncoder()
    combined = list(data_train[col].astype(str).values) + list(data_test_a[col].astype(str).values)
    le.fit(combined)
    data_train[col] = le.transform(list(data_train[col].astype(str).values))
    data_test_a[col] = le.transform(list(data_test_a[col].astype(str).values))
print('Label Encoding 完成')
# 特征选择 (feature selection)
# Filter methods:
#   a. 方差选择法 (variance threshold)
#   b. 相关系数法 (Pearson correlation coefficient)
#   c. 卡方检验 (chi-squared test)
#   d. 互信息法 (mutual information)