任务:分别用IV值和随机森林进行特征选择。然后分别构建模型(逻辑回归、SVM、决策树、随机森林、GBDT、XGBoost和LightGBM),进行模型评估。
导入模块
import pandas as pd
from pandas import DataFrame as df
from numpy import log
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
import matplotlib.pyplot as plt
IV值的计算
def calcWOE(dataset, col, target):
    """Compute Weight of Evidence (WOE) per level of a feature column.

    Parameters
    ----------
    dataset : pd.DataFrame
        Data containing `col` and a binary 0/1 response column `target`.
    col : str
        Feature column; each distinct value is treated as one bin.
    target : str
        Binary response column (1 = responder / "bad").

    Returns
    -------
    pd.DataFrame
        Indexed by the levels of `col`, with columns "bad" (responder
        share), "good" (non-responder share) and "WOE" = log(bad/good).
        Levels with zero good (or bad) counts yield inf/nan WOE; the
        caller is expected to drop those rows before computing IV.
    """
    grouped = dataset.groupby(col)
    # Per-level sample count and responder count, aligned on the level index
    # (equivalent to the original count/sum merge on index).
    data = df({col: grouped[col].count(), target: grouped[target].sum()})
    # Totals: all samples, responders ("bad"), non-responders ("good").
    b_total = data[target].sum()
    total = data[col].sum()
    g_total = total - b_total
    # Vectorized WOE; the original per-row apply used round(x, 100), which is
    # a no-op for floats and added O(rows) Python-level overhead.
    data["bad"] = data[target] / b_total
    data["good"] = (data[col] - data[target]) / g_total
    data["WOE"] = log(data["bad"] / data["good"])
    return data.loc[:, ["bad", "good", "WOE"]]
def calcIV(dataset):
    """Compute the Information Value (IV) from a WOE table.

    Parameters
    ----------
    dataset : pd.DataFrame
        WOE table as returned by calcWOE (columns "bad", "good", "WOE"),
        with nan/inf rows already removed by the caller.

    Returns
    -------
    float
        IV = sum((bad - good) * WOE) over all levels.  The per-level
        terms are also stored in a new "IV" column on `dataset`,
        preserving the original implementation's side effect.
    """
    # Vectorized replacement of the original row-wise apply; the stray
    # debug print() has been removed.
    dataset["IV"] = (dataset["bad"] - dataset["good"]) * dataset["WOE"]
    return dataset["IV"].sum()
# --- IV-based feature selection -----------------------------------------
# NOTE(review): `data` is assumed to be the preprocessed DataFrame loaded
# earlier in the file (not visible in this chunk), with a binary `status`
# target and a leftover CSV index column 'Unnamed: 0'.
y = data.status
x = data.drop('status', axis=1)
# Candidate feature columns: everything except the index column and target.
col_list = list(data.drop(labels=['Unnamed: 0', 'status'], axis=1).columns)
data_IV = df()
fea_iv = []
for col in col_list:
    col_WOE = calcWOE(data, col, "status")
    # Drop levels whose WOE is nan/+inf/-inf (bins with zero good or bad
    # counts).  Positional `any(1)` was removed in pandas 2.0; be explicit.
    col_WOE = col_WOE[~col_WOE.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
    col_IV = calcIV(col_WOE)
    # Keep features with IV > 0.1 (conventional "medium predictive power" cutoff).
    if col_IV > 0.1:
        data_IV[col] = [col_IV]
        fea_iv.append(col)
# Raw string: the original non-raw path only worked because '\课', '\A', '\d'
# happen to be invalid escapes that pass through unchanged.
data_IV.to_csv(r'F:\课程\AI\data_IV.csv', index=False)
print(fea_iv)
使用随机森林进行特征选择
# --- Random-forest-based feature selection ------------------------------
from sklearn.ensemble import RandomForestClassifier

# Fit a forest on the full feature matrix and rank features by impurity
# importance; the 20 highest-ranked become the tree-based candidate set.
rfc = RandomForestClassifier(random_state=2018)
rfc.fit(x, y)
importance = pd.Series(
    rfc.feature_importances_, index=x.columns
).sort_values(ascending=False)
rfc_result = importance.head(20).index.tolist()
print(rfc_result)
特征合并
# --- Merge the two candidate feature sets -------------------------------
# Union of the IV-selected and forest-selected features.
features = list(set(rfc_result) | set(fea_iv))
X_final = x[features]
print(X_final.shape)

# The model blocks below evaluate on x_train_standard / x_test_standard /
# y_train / y_test, which were never created anywhere in this file even
# though train_test_split and StandardScaler are imported at the top —
# build them here.  NOTE(review): test_size=0.3 is an assumed split ratio;
# confirm against the original experiment setup.
x_train, x_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.3, random_state=2018)
# Fit the scaler on the training fold only to avoid test-set leakage.
scaler = StandardScaler()
x_train_standard = scaler.fit_transform(x_train)
x_test_standard = scaler.transform(x_test)
定义评分函数
def get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba):
    """Plot train/test ROC curves and print the standard classification metrics.

    Accuracy, precision, recall, F1 and AUC are computed for both the
    training and the test split.  The probability/score arguments feed
    the AUC and ROC computations; the hard predictions feed the rest.
    """
    # (label, scorer, needs probability scores) in the exact print order below.
    scorers = (
        ("准确率", metrics.accuracy_score, False),
        ("精准率", metrics.precision_score, False),
        ("召回率", metrics.recall_score, False),
        ("F1-score", metrics.f1_score, False),
        ("AUC", metrics.roc_auc_score, True),
    )
    results = []
    for label, scorer, needs_proba in scorers:
        if needs_proba:
            results.append((label, scorer(y_train, y_train_proba), scorer(y_test, y_test_proba)))
        else:
            results.append((label, scorer(y_train, y_train_predict), scorer(y_test, y_test_predict)))
    # ROC curves on one figure: train first, then test.
    for y_true, y_proba in ((y_train, y_train_proba), (y_test, y_test_proba)):
        fprs, tprs, _ = metrics.roc_curve(y_true, y_proba)
        plt.plot(fprs, tprs)
    plt.title("ROC Curve")
    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.show()
    # Emit the scores in the original train/test-per-metric order.
    for label, train_score, test_score in results:
        print("训练集" + label + ":", train_score)
        print("测试集" + label + ":", test_score)
逻辑回归
# Logistic-regression baseline on the standardized features.
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression(random_state=2018)
lr.fit(x_train_standard, y_train)
# Hard labels and positive-class probabilities for both splits.
y_train_predict, y_test_predict = (
    lr.predict(x_train_standard), lr.predict(x_test_standard))
y_train_proba, y_test_proba = (
    lr.predict_proba(x_train_standard)[:, 1],
    lr.predict_proba(x_test_standard)[:, 1])
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
SVM
# Linear SVM; LinearSVC exposes no predict_proba, so the signed
# decision-function margin serves as the ranking score for ROC/AUC.
from sklearn.svm import LinearSVC

svm_linearSVC = LinearSVC(random_state=2018)
svm_linearSVC.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = (
    svm_linearSVC.predict(x_train_standard),
    svm_linearSVC.predict(x_test_standard))
y_train_proba, y_test_proba = (
    svm_linearSVC.decision_function(x_train_standard),
    svm_linearSVC.decision_function(x_test_standard))
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
决策树
# Single decision tree evaluated on both splits.
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(random_state=2018)
tree.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = [
    tree.predict(m) for m in (x_train_standard, x_test_standard)]
y_train_proba, y_test_proba = [
    tree.predict_proba(m)[:, 1] for m in (x_train_standard, x_test_standard)]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
随机森林
# Random forest with out-of-bag scoring enabled.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, oob_score=True, random_state=2018)
rf.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = [
    rf.predict(m) for m in (x_train_standard, x_test_standard)]
y_train_proba, y_test_proba = [
    rf.predict_proba(m)[:, 1] for m in (x_train_standard, x_test_standard)]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
GBDT
# Gradient-boosted decision trees (sklearn defaults).
from sklearn.ensemble import GradientBoostingClassifier

gb = GradientBoostingClassifier(random_state=2018)
gb.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = [
    gb.predict(m) for m in (x_train_standard, x_test_standard)]
y_train_proba, y_test_proba = [
    gb.predict_proba(m)[:, 1] for m in (x_train_standard, x_test_standard)]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
XGBoost
# XGBoost classifier with default hyperparameters.
from xgboost import XGBClassifier

xgb = XGBClassifier(random_state=2018)
xgb.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = [
    xgb.predict(m) for m in (x_train_standard, x_test_standard)]
y_train_proba, y_test_proba = [
    xgb.predict_proba(m)[:, 1] for m in (x_train_standard, x_test_standard)]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)
LightGBM
# LightGBM classifier with default hyperparameters.
from lightgbm import LGBMClassifier

lg = LGBMClassifier(random_state=2018)
lg.fit(x_train_standard, y_train)
y_train_predict, y_test_predict = [
    lg.predict(m) for m in (x_train_standard, x_test_standard)]
y_train_proba, y_test_proba = [
    lg.predict_proba(m)[:, 1] for m in (x_train_standard, x_test_standard)]
get_scores(y_train, y_test, y_train_predict, y_test_predict, y_train_proba, y_test_proba)