import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
# Load data; take up to 50k rows of each class so the training set is balanced.
data_train = pd.read_csv('./train.csv')
data_train1 = data_train[data_train['isDefault'] == 1][:50000]
data_train2 = data_train[data_train['isDefault'] == 0][:50000]
data_train = pd.concat([data_train1, data_train2])
data_test_a = pd.read_csv('./testA.csv')[:50000]
data_train.head()

# Split columns into numerical vs. categorical (object-dtype) features.
numerical_fea = list(data_train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea, list(data_train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

# Inspect missing-value counts.
data_train.isnull().sum()[data_train.isnull().sum() > 0]

# Fill numerical features with the TRAINING-set median (test uses the same
# statistics to avoid leaking test information).
data_train[numerical_fea] = data_train[numerical_fea].fillna(data_train[numerical_fea].median())
data_test_a[numerical_fea] = data_test_a[numerical_fea].fillna(data_train[numerical_fea].median())

# Fill categorical features with the training-set mode.
# BUG FIX: DataFrame.mode() returns a DataFrame indexed 0..k-1; passing that
# to fillna aligns on the row index and effectively fills only row 0.
# Take the first mode row (a Series, aligned on column labels) instead.
category_mode = data_train[category_fea].mode().iloc[0]
data_train[category_fea] = data_train[category_fea].fillna(category_mode)
data_test_a[category_fea] = data_test_a[category_fea].fillna(category_mode)
# 处理employmentLength
def employmentLength_to_int(s):
    """Convert an employmentLength string such as '5 years' to an int8.

    Missing values (NaN) are returned unchanged so downstream fillna
    can still detect them.
    """
    return s if pd.isnull(s) else np.int8(s.split()[0])
# Normalise the two irregular employmentLength buckets, then convert the
# whole column to numeric years.
for frame in [data_train, data_test_a]:
    frame['employmentLength'] = (
        frame['employmentLength']
        .replace({'10+ years': '10 years', '< 1 year': '0 years'})
        .apply(employmentLength_to_int)
    )
# Re-check missing-value counts after the conversion.
data_train.isnull().sum()[data_train.isnull().sum() > 0]

# Fill missing employmentLength with 10.0 — the '10 years' bucket is the most
# frequent category.
# BUG FIX: the test frame was previously filled from data_train
# (`data_test_a[...] = data_train[...].fillna(10.0)`), overwriting test values
# with training rows; fill data_test_a from its own column instead.
data_train[['employmentLength']] = data_train[['employmentLength']].fillna(10.0)
data_test_a[['employmentLength']] = data_test_a[['employmentLength']].fillna(10.0)

# Confirm nothing is left missing.
data_train.isnull().sum()[data_train.isnull().sum() > 0]
# earliesCreditLine looks like 'Aug-2001'; keep only the trailing 4-digit year.
for frame in (data_train, data_test_a):
    frame['earliesCreditLine'] = frame['earliesCreditLine'].map(lambda v: int(v[-4:]))
# Label-encode the high-cardinality categorical features. The encoder is
# fitted on the union of train and test values so categories that appear only
# in the test set cannot raise at transform time.
high_card_cols = ['employmentTitle', 'postCode', 'title', 'subGrade']
for col in tqdm(high_card_cols):
    encoder = LabelEncoder()
    train_vals = list(data_train[col].astype(str).values)
    test_vals = list(data_test_a[col].astype(str).values)
    encoder.fit(train_vals + test_vals)
    data_train[col] = encoder.transform(train_vals)
    data_test_a[col] = encoder.transform(test_vals)
print('Label Encoding 完成')
# Turn issueDate into a numeric feature: days elapsed since 2007-06-01.
base_date = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
for frame in [data_train, data_test_a]:
    issued = pd.to_datetime(frame['issueDate'], format='%Y-%m-%d')
    frame['issueDate'] = (issued - base_date).dt.days
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
# Drop uninformative columns, then train baseline random forests.
drop_fields = [
    "policyCode",  # constant column — carries no signal
    "n11",         # highly skewed discrete variable
    "n12",         # highly skewed discrete variable
]
fields = [f for f in data_train.columns if f not in drop_fields]
feature = [f for f in fields if f != 'isDefault']
data_train.shape, data_test_a.shape

x_train = data_train[feature]
x_test = data_test_a[feature]
# Downcast float64 test columns to float32 to cut memory usage.
float_cols = x_test.select_dtypes(include=[np.float64]).columns
x_test[float_cols] = x_test[float_cols].astype(np.float32)
y_train = data_train['isDefault']

x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.33, random_state=42)

# Baseline: 500 shallow trees.
clf = RandomForestClassifier(max_depth=3, random_state=0, n_estimators=500)
clf.fit(x_train, y_train)
clf.score(x_val, y_val)  # 0.8622424242424243

# Deliberately weak model for comparison: stumps + log2 feature sampling.
clf = RandomForestClassifier(max_depth=1, random_state=0, n_estimators=10, max_features='log2')
clf.fit(x_train, y_train)
clf.score(x_val, y_val)  # 0.8495454545454545

# Best of the three: depth-3 trees with log2 feature sampling.
clf2 = RandomForestClassifier(max_depth=3, random_state=0, n_estimators=200, max_features='log2')
clf2.fit(x_train, y_train)
clf2.score(x_val, y_val)  # 0.8633030303030303

# NOTE(review): this predicts with `clf`, which at this point is the WEAKEST
# model (the depth-1 one) — presumably `clf2` (best validation score) was
# intended; confirm before using these predictions.
clf.predict(x_test)
# 数据分析案例2: 贷款违约预测 (Data analysis case 2: loan default prediction)
# 最新推荐文章于 2023-05-08 09:45:00 发布 (blog footer retained as a comment so the file parses)