def plot_ks(y_test, y_score, positive_flag):
    """Plot the Kolmogorov-Smirnov (KS) curve of a binary classifier.

    Samples are ranked by descending score. The score found at each decile
    (10%, 20%, ..., 90%) of that ranking is used as a threshold, and
    Sensitivity and 1-Specificity are evaluated at every threshold. Both
    curves are drawn against the decile, and the widest gap between them
    (the KS statistic) is marked with a dashed vertical line and a label.

    Args:
        y_test: Sequence of true class labels.
        y_score: Sequence of scores, paired with ``y_test`` by position.
        positive_flag: Label value in ``y_test`` treated as the positive
            class; every other label counts as negative.

    Raises:
        ValueError: If ``y_test`` does not contain at least one positive and
            one negative sample (this includes empty input), because the
            rates would otherwise require a division by zero.
    """
    # Pair labels and scores by position on plain arrays: the caller's
    # objects are never modified, and Series with different indexes cannot
    # be silently misaligned.
    target_data = pd.DataFrame({'y_test': np.asarray(y_test),
                                'y_score': np.asarray(y_score)})
    target_data = target_data.sort_values(by='y_score', ascending=False)

    # Class membership and class sizes do not depend on the threshold,
    # so compute them once instead of once per decile.
    is_positive = target_data['y_test'] == positive_flag
    positive = int(is_positive.sum())
    negative = int((~is_positive).sum())
    if positive == 0 or negative == 0:
        raise ValueError(
            'plot_ks needs at least one positive and one negative sample '
            '(got %d positive, %d negative)' % (positive, negative))

    # Deciles of the ranking and the score sitting at each of them.
    cuts = np.arange(0.1, 1, 0.1)
    positions = (len(target_data) * cuts).astype('int')
    scores = target_data['y_score']
    thresholds = scores.iloc[positions]

    sensitivity = []
    specificity = []
    for threshold in thresholds:
        # A sample is predicted positive when its score is strictly above
        # the threshold, and negative otherwise.
        true_positive = (is_positive & (scores > threshold)).sum()
        true_negative = (~is_positive & (scores <= threshold)).sum()
        sensitivity.append(true_positive / positive)
        specificity.append(true_negative / negative)

    sensitivity = np.array(sensitivity)
    false_positive_rate = 1 - np.array(specificity)
    plot_data = pd.DataFrame({'cuts': cuts,
                              'y1': false_positive_rate,
                              'y2': sensitivity,
                              'ks': sensitivity - false_positive_rate})

    # Row with the widest gap between the two curves (positional lookup).
    best = plot_data.iloc[int(np.argmax(plot_data['ks'].values))]

    # Both curves are anchored at (0, 0) and (1, 1).
    x_points = [0] + cuts.tolist() + [1]
    plt.plot(x_points, [0] + plot_data['y1'].tolist() + [1], label='1-Specificity')
    plt.plot(x_points, [0] + plot_data['y2'].tolist() + [1], label='Sensitivity')
    plt.vlines(best['cuts'], ymin=best['y1'], ymax=best['y2'], linestyles='--')
    # Put the KS label just right of the dashed line, halfway up the gap.
    plt.text(x=best['cuts'] + 0.01,
             y=best['y1'] + best['ks'] / 2,
             s='KS= %.2f' % best['ks'])
    plt.legend()
    plt.show()
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Read the sample workbook and draw its KS curve, using the Class column as
# the true labels, the Score column as the scores and 'P' as the positive class.
virtual_data = pd.read_excel(r'virtual_data.xlsx')
plot_ks(y_test = virtual_data.Class, y_score = virtual_data.Score,positive_flag = 'P')
import pandas as pd
import numpy as np
from sklearn import linear_model
from sklearn import model_selection
# Load the data set; every column from the fifth onward is used as a predictor
# and the `activity` column is the target.
sports = pd.read_csv(r'Run or Walk.csv')
predictors = sports.columns[4:]
# DataFrame.ix was removed in pandas 1.0; .loc is the label-based equivalent.
X = sports.loc[:, predictors]
y = sports.activity
# Hold out 25% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.25, random_state = 1234)
# Fit a logistic regression with scikit-learn's default settings and show its parameters.
sklearn_logistic = linear_model.LogisticRegression()
sklearn_logistic.fit(X_train, y_train)
print(sklearn_logistic.intercept_, sklearn_logistic.coef_)
# Predict the class of each test row.
sklearn_predict = sklearn_logistic.predict(X_test)
# Count of each predicted class (only displayed in an interactive session).
pd.Series(sklearn_predict).value_counts()
from sklearn import metrics
# Confusion matrix of the scikit-learn predictions, with rows/columns ordered as [0, 1].
cm = metrics.confusion_matrix(y_test, sklearn_predict, labels = [0,1])
# Only displayed in an interactive session.
cm
# The metric functions are part of sklearn.metrics itself; the private
# sklearn.metrics.scorer module was deprecated in 0.22 and later removed.
Accuracy = metrics.accuracy_score(y_test, sklearn_predict)
# Recall of the default positive class (label 1).
Sensitivity = metrics.recall_score(y_test, sklearn_predict)
# Recall computed with label 0 as the positive class, i.e. the specificity.
Specificity = metrics.recall_score(y_test, sklearn_predict, pos_label=0)
print('模型准确率为%.2f%%:' %(Accuracy*100))
print('正例覆盖率为%.2f%%' %(Sensitivity*100))
print('负例覆盖率为%.2f%%' %(Specificity*100))
import seaborn as sns
import matplotlib.pyplot as plt
# Draw the confusion matrix as an annotated heat map.
sns.heatmap(cm, annot = True, fmt = '.2e',cmap = 'GnBu')
plt.show()
# Second column of predict_proba, used below as the score for class 1.
y_score = sklearn_logistic.predict_proba(X_test)[:,1]
# ROC curve points and the area under the curve.
fpr,tpr,threshold = metrics.roc_curve(y_test, y_score)
roc_auc = metrics.auc(fpr,tpr)
# Shade the area under the ROC curve, then outline the curve itself.
plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
plt.plot(fpr, tpr, color='black', lw = 1)
# Dashed red diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
# Write the AUC value inside the plot.
plt.text(0.5,0.3,'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
# KS curve for the same scores, with label 1 as the positive class.
plot_ks(y_test = y_test, y_score = y_score, positive_flag = 1)
import statsmodels.api as sm
# Recreate the same 75/25 split (same seed) for the statsmodels version of the model.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size = 0.25, random_state = 1234)
# add_constant appends the intercept column, which statsmodels does not add by itself.
X_train2 = sm.add_constant(X_train)
X_test2 = sm.add_constant(X_test)
# Logit is exposed directly on statsmodels.api; the capitalised class is no
# longer available through sm.formula in current statsmodels releases.
sm_logistic = sm.Logit(y_train, X_train2).fit()
# Fitted coefficients (only displayed in an interactive session).
sm_logistic.params
# Predicted probabilities for the test rows, turned into 0/1 labels at a 0.5 cut-off.
sm_y_probability = sm_logistic.predict(X_test2)
sm_pred_y = np.where(sm_y_probability >= 0.5, 1, 0)
# Confusion matrix with rows/columns ordered as [0, 1] (only displayed interactively).
cm = metrics.confusion_matrix(y_test, sm_pred_y, labels = [0,1])
cm
# ROC curve points and the area under the curve for the statsmodels scores.
fpr,tpr,threshold = metrics.roc_curve(y_test, sm_y_probability)
roc_auc = metrics.auc(fpr,tpr)
# Shade the area under the ROC curve, outline it, and add the dashed reference diagonal.
plt.stackplot(fpr, tpr, color='steelblue', alpha = 0.5, edgecolor = 'black')
plt.plot(fpr, tpr, color='black', lw = 1)
plt.plot([0,1],[0,1], color = 'red', linestyle = '--')
plt.text(0.5,0.3,'ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('1-Specificity')
plt.ylabel('Sensitivity')
plt.show()
# Give the probabilities a 0..n-1 index so they line up row by row with the labels in plot_ks.
sm_y_probability.index = np.arange(len(sm_y_probability))
plot_ks(y_test = y_test, y_score = sm_y_probability, positive_flag = 1)