KS 的理解:将样本按预测分数从高到低排序,在每个不同的阈值下分别计算正样本(坏样本)的累计召回率 TPR 和负样本(好样本)的累计召回率 FPR(即两类样本各自的累计分布),KS 就是所有阈值上 TPR 与 FPR 之差的最大值,即 KS = max(TPR - FPR)
1、numpy
def calc_ks_np(y_true, y_pred):
    """Compute the KS statistic of binary labels against scores with NumPy.

    Samples are ranked by score in descending order. At every distinct score
    value the cumulative share of positives (TPR) and of negatives (FPR) is
    evaluated, and the largest ``TPR - FPR`` gap is returned. The curve
    includes the origin (0, 0), so the result is never negative. Note that
    this is the signed gap, not ``max(|TPR - FPR|)``.

    Args:
        y_true: Array-like of labels; values are summed directly, so they are
            expected to be 1 for positives and 0 for negatives. Any shape is
            accepted and flattened.
        y_pred: Array-like of scores with the same number of elements as
            ``y_true``. Any shape is accepted and flattened.

    Returns:
        The maximum of ``TPR - FPR`` over all thresholds; ``0.0`` when the
        input is empty.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in element count.
    """
    # np.asarray lets callers pass plain lists/tuples as well as ndarrays.
    y_true = np.asarray(y_true).reshape(-1)
    y_pred = np.asarray(y_pred).reshape(-1)

    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements, "
            "got %d and %d" % (y_true.size, y_pred.size)
        )
    if y_pred.size == 0:
        # No samples means no gap between the two distributions.
        return 0.0

    # Rank samples from the highest score to the lowest.
    sort_index = np.argsort(y_pred, kind="mergesort")[::-1]
    y_pred = y_pred[sort_index]
    y_true = y_true[sort_index]

    # Tied scores share one threshold: keep only the last index of each run
    # of equal scores, plus the final index so the curve ends at the totals.
    distinct_value_indices = np.where(np.diff(y_pred))[0]
    threshold_idxs = np.r_[distinct_value_indices, y_pred.size - 1]

    # Cumulative positive / negative counts at each threshold, with a leading
    # 0 so the curve starts at the origin.
    tps = np.r_[0, np.cumsum(y_true)[threshold_idxs]]
    fps = np.r_[0, np.cumsum(1 - y_true)[threshold_idxs]]

    # The tiny epsilon avoids 0/0 when one of the classes is absent.
    tpr = tps / (tps[-1] + 1e-32)
    fpr = fps / (fps[-1] + 1e-32)
    return np.max(tpr - fpr)
2、tensorflow
def calc_ks_tf(y_true, y_pred):
    """Compute the KS statistic of binary labels against scores with TensorFlow.

    Samples are ranked by score in descending order. At every distinct score
    value the cumulative share of positives (TPR) and of negatives (FPR) is
    evaluated, and the largest ``TPR - FPR`` gap is returned. The curve
    includes the origin (0, 0), so the result is never negative. Note that
    this is the signed gap, not ``max(|TPR - FPR|)``.

    Args:
        y_true: Tensor of labels; values are summed directly, so they are
            expected to be 1 for positives and 0 for negatives. Any shape is
            accepted and flattened. Must be a numeric dtype (``1 - y_true``
            is evaluated).
        y_pred: Tensor of scores with the same number of elements as
            ``y_true``. Any shape is accepted and flattened.

    Returns:
        A scalar float32 tensor holding the maximum of ``TPR - FPR``.
        NOTE(review): empty input is not handled and will fail in the gather.
    """
    y_true = tf.reshape(y_true, shape=[-1])
    y_pred = tf.reshape(y_pred, shape=[-1])

    # Rank samples from the highest score to the lowest.
    order = tf.argsort(y_pred, direction='DESCENDING', axis=0)
    y_pred = tf.gather(y_pred, order)
    y_true = tf.gather(y_true, order)

    # Tied scores share one threshold: keep only the last index of each run
    # of equal scores. tf.where on a boolean mask yields int64 indices.
    score_changed = tf.not_equal(y_pred[1:], y_pred[:-1])
    distinct_value_indices = tf.reshape(tf.where(score_changed), shape=[-1])

    # The final index is taken from the flattened size, so the result does
    # not depend on the shape the caller passed in.
    last_idx = tf.reshape(tf.size(y_pred, out_type=tf.int64) - 1, shape=[1])
    threshold_idxs = tf.concat([distinct_value_indices, last_idx], 0)

    # Cumulative positive / negative counts at each threshold.
    tps = tf.cast(tf.gather(tf.cumsum(y_true), threshold_idxs), tf.float32)
    fps = tf.cast(tf.gather(tf.cumsum(1 - y_true), threshold_idxs), tf.float32)

    # Prepend 0 so the curve starts at the origin, as in calc_ks_np.
    origin = tf.zeros([1], dtype=tf.float32)
    tps = tf.concat([origin, tps], 0)
    fps = tf.concat([origin, fps], 0)

    # The tiny epsilon avoids 0/0 when one of the classes is absent.
    tpr = tps / (tps[-1] + tf.constant(1e-32))
    fpr = fps / (fps[-1] + tf.constant(1e-32))
    return tf.reduce_max(tpr - fpr)