一. 随机森林模型
为了尝试改良基线模型的性能,我们可以更换算法。让我们在相同的训练数据上使用随机森林,看看它如何影响性能。随机森林是一个更强大的模型,特别是在使用数百棵树的时候。我们将在随机森林中使用100棵树。
# 导入基本处理模块
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import os
import warnings
warnings.filterwarnings('ignore')
import matplotlib.pyplot as plt
import seaborn as sns
# 输入处理缺失值
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
# ---- Data loading and preprocessing ----
# Load the Home Credit application data (train has the TARGET column, test does not).
app_train = pd.read_csv('E:/home_credit_default_risk/application_train.csv')
app_test = pd.read_csv('E:/home_credit_default_risk/application_test.csv')

# Keep the labels before restricting to numeric columns.
train_labels = app_train['TARGET']

# Keep only numeric columns; SimpleImputer/MinMaxScaler cannot handle raw strings.
app_train = app_train.select_dtypes(['number'])
app_test = app_test.select_dtypes(['number'])

# Drop the target from the training features.
if 'TARGET' in app_train:
    train = app_train.drop(columns=['TARGET'])
else:
    train = app_train.copy()

# Feature names: the exact columns, in order, that the model is trained on.
features = list(train.columns)

# BUG FIX: select the same feature columns, in the same order, from the test
# set. The original code passed raw app_test to the imputer (and its earlier
# `test = app_test.copy()` was dead code), silently relying on the test frame
# having exactly the training columns in the training order. reindex makes the
# alignment explicit; any column absent from the test set becomes NaN and is
# then median-imputed like any other missing value.
test = app_test.reindex(columns=features)

# Median imputation for missing values.
imputer = SimpleImputer(strategy='median')

# Scale each feature to the [0, 1] range.
scaler = MinMaxScaler(feature_range=(0, 1))

# Fit on the training data only, then transform both splits with the same
# statistics (avoids leaking test-set information into the preprocessing).
imputer.fit(train)
train = imputer.transform(train)
test = imputer.transform(test)

scaler.fit(train)
train = scaler.transform(train)
test = scaler.transform(test)
from sklearn.ensemble import RandomForestClassifier

# ---- Random forest model ----
# 100 trees; fixed random_state for reproducibility; n_jobs=-1 uses all CPU cores.
random_forest = RandomForestClassifier(n_estimators=100, random_state=50,
                                       verbose=1, n_jobs=-1)

# Train on the preprocessed (imputed + scaled) training data.
random_forest.fit(train, train_labels)

# Extract feature importances for later inspection.
feature_importance_values = random_forest.feature_importances_
feature_importances = pd.DataFrame({'feature': features,
                                    'importance': feature_importance_values})

# Predict the probability of the positive class (default) on the test set.
predictions = random_forest.predict_proba(test)[:, 1]

# BUG FIX: take an explicit copy before adding the TARGET column. The original
# assigned into a slice of app_test, which raises pandas'
# SettingWithCopyWarning (hidden here only because warnings are globally
# suppressed) and is not guaranteed to behave consistently across versions.
submit = app_test[['SK_ID_CURR']].copy()
submit['TARGET'] = predictions

# Save the submission file.
submit.to_csv('random_forest_baseline.csv', index=False)