读取数据
import pandas as pd
# Load the whitelist CSV from the working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='白名单.csv')
查看数据维度
# Notebook display expression: the (row count, column count) tuple of df.
df.shape
统计Y轴(数据集好坏样本比例)
# Count how many rows carry each distinct value of the `overdue` label column.
df['overdue'].value_counts()
Y轴柱形图
# Bar chart of the label distribution: one bar per distinct `overdue` value,
# bar height = number of rows with that value. pandas draws through its
# matplotlib backend, so matplotlib must be installed for this cell to render.
df['overdue'].value_counts().plot(kind='bar')
数据集划分
# Features: every column except the label (`overdue`) and `order_id`.
X = df.drop(columns=['overdue', 'order_id']).values
# Labels: the `overdue` column as a NumPy array.
Y = df['overdue'].values

# Sequential (unshuffled) 9:1 split: the first 90% of rows are used for
# training and the remaining rows for testing.
n_train = int(0.9 * len(df))
X_train, X_test = X[:n_train], X[n_train:]
Y_train, Y_test = Y[:n_train], Y[n_train:]
onehot编码
from sklearn.preprocessing import OneHotEncoder
# By default the encoder raises an error when transform() meets a category
# that was not seen during fit(). With handle_unknown='ignore' such unseen
# categories are encoded as all zeros instead of raising.
enc = OneHotEncoder(handle_unknown='ignore')
# Learn the category set of every feature column from the training data only,
# then encode the training features.
X_train_enc = enc.fit_transform(X_train)
# Notebook display expression for the encoded training matrix.
X_train_enc
# Encode the test features with the categories learned from the training set.
X_test_enc = enc.transform(X_test)
训练模型
from sklearn.tree import DecisionTreeClassifier#决策树DecisionTreeClassifier
from sklearn.metrics import roc_auc_score
# The default DecisionTreeClassifier() is adequate for a small dataset; for a
# larger one, require at least 30 samples in a node before it may be split.
dtree = DecisionTreeClassifier(min_samples_split=30)
dtree.fit(X_train_enc, Y_train)
# predict_proba returns one column per class in dtree.classes_; column 1 is
# the second class (the positive class when the labels are 0/1).
preds = dtree.predict_proba(X_test_enc)[:, 1]
# Model quality: ROC AUC of the predicted probabilities on the test split.
roc_auc_score(Y_test, preds)
测试模型
import numpy as np
# Random-guess baseline to compare against the model's AUC.
# Start from an all-zero prediction vector with one entry per test sample.
guess = np.zeros(len(Y_test))
# Set a randomly chosen 871/935 share of the entries to 1 (sampled without
# replacement, so exactly that many entries are flipped).
# NOTE(review): 871/935 is presumably a class ratio taken from the data — confirm.
click_index = np.random.choice(len(guess), int(len(guess) * 871 / 935), replace=False)
guess[click_index] = 1
# ROC AUC of the random baseline on the test labels.
roc_auc_score(Y_test, guess)