文章目录
数据集准备
# Load the iris dataset and fit a logistic-regression classifier,
# then predict back on the same data (in-sample / resubstitution).
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
X = iris.data
y = iris.target

lr = LogisticRegression()
lr.fit(X, y)          # X/y are the same arrays as iris.data / iris.target
yfit = lr.predict(X)  # in-sample predictions
一.准确率得分accuracy_score
# Accuracy = fraction of correct predictions, i.e. sum(y == yfit) / len(y).
from sklearn.metrics import accuracy_score

accuracy_score(y, yfit)
>>>0.96
类别数据不平衡的情况下,准确率得分毫无意义。因此需要有方法可以分解并统计分类器产生的不同类型的正误情况。
二.混淆矩阵 confusion_matrix
建立混淆矩阵
# Confusion matrix of true labels vs. in-sample predictions
# (rows = true class, columns = predicted class).
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y, yfit)
mat
>>>
array([[50, 0, 0],
[ 0, 45, 5],
[ 0, 1, 49]], dtype=int64)
绘制热力图
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(mat, square=True, annot=True, cbar=False,cmap='binary')
plt.xlabel('predicted value')
plt.ylabel('true value')
三.准确率Precision和召回率Recall
精确率Precision:代表所有被预测为阳性的样本中,确实为阳性的几率,TP/(TP+FP)
召回率Recall:代表所有实际为阳性的样本中,得以正确检测出阳性结果的几率,TP/(TP+FN)
F1:两者的调和平均数 2×Precision×Recall/(Precision+Recall)
# Per-class precision, recall and F1 in a single text report.
from sklearn.metrics import classification_report

print(classification_report(y, yfit))
四.交叉验证
训练集和测试集是同一个,可能会导致过度适配,因此需要把数据集拆分成训练集和测试集。
1) holdout验证
随机从最初的样本中选取部分(一般大于2/3)做训练数据,剩余的做验证数据。
# Hold-out validation: split the data, train a decision tree on the
# training part, and score its predictions on the unseen test part.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=123)

dtc = DecisionTreeClassifier()
dtc.fit(train_X, train_y)

# BUG FIX: the original scored `fit_y` before it was ever assigned
# (NameError); the test-set predictions must be computed first.
fit_y = dtc.predict(test_X)

accuracy_score(test_y, fit_y)
>>>0.94
#建立混淆矩阵
from sklearn.metrics import confusion_matrix
mat=confusion_matrix(test_y,fit_y)
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
sns.heatmap(mat, square=True, annot=True, cbar=False,cmap='binary')
2)kfold交叉验证分析
K次交叉验证,初始样本被分割成K个子集,k-1个子集用作训练,1个子集用作验证。轮换K次,每个子集都做过一次验证集。
a.逐步kfold验证
# K-fold cross-validation: split into 10 consecutive folds, train a fresh
# decision tree on each training split, and record the accuracy on the
# held-out fold.
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)  # 10 folds over 150 samples (indices 0-14, 15-29, ...)
acc = []
for train, test in kf.split(X):
    # FIX: the loop body had lost its indentation in the original paste,
    # which is a SyntaxError in Python.
    train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
    dtc = DecisionTreeClassifier()
    dtc.fit(train_X, train_y)
    fit_y = dtc.predict(test_X)
    print(accuracy_score(test_y, fit_y))
    acc.append(accuracy_score(test_y, fit_y))
计算平均分
# Mean accuracy across the 10 folds.
import numpy as np
np.asarray(acc).mean()
>>>0.9400000000000001
b.直接获得交叉验证分数组
# cross_val_score runs the whole K-fold loop in a single call.
from sklearn.model_selection import cross_val_score

dtc = DecisionTreeClassifier()           # un-fitted estimator
acc = cross_val_score(dtc, X, y, cv=10)  # splits the data automatically
acc.mean()
>>>0.96
acc
>>>
array([1. , 0.93333333, 1. , 0.93333333, 0.93333333,
0.86666667, 0.93333333, 0.93333333, 1. , 1. ])
3)留一验证
留取样本中的一项来做验证集,其余都用作训练集,每一项都作一次验证集。
# Leave-one-out: each of the 150 samples serves once as a one-sample test
# set while the remaining 149 samples train the model.
from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()
acc = []
for train, test in loo.split(X):
    # FIX: the loop body had lost its indentation in the original paste,
    # which is a SyntaxError in Python.
    train_X, test_X, train_y, test_y = X[train], X[test], y[train], y[test]
    dtc = DecisionTreeClassifier()
    dtc.fit(train_X, train_y)
    fit_y = dtc.predict(test_X)
    print(accuracy_score(test_y, fit_y))
    acc.append(accuracy_score(test_y, fit_y))
计算平均分
# Average of the 150 per-sample scores (each is 0.0 or 1.0).
import numpy as np
np.asarray(acc).mean()
>>>0.9533333333333334
五.ROC曲线和AUC得分
接受者操作特征
X轴:假阳性率(FPR),(1-特异性)
Y轴:真阳性率(TPR),敏感性
1)数据准备
# Keep only classes 1 and 2 (rows 50-149) so the problem is binary.
X = iris.data[50:150, ]
y = iris.target[50:150]

# Re-encode the two labels as integers starting from 0.
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
yft = le.fit_transform(y)
yft
2)建立模型
# Fit a decision tree on a train/test split of the binary data.
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(X, yft, test_size=0.33, random_state=123)

dtc = DecisionTreeClassifier()
dtc.fit(train_X, train_y)

# predict_proba: column 0 = P(label 0), column 1 = P(label 1) per sample.
yproba = dtc.predict_proba(test_X)
yproba
yproba[:, 1]  # probability of label 1 (0.0 => certainly label 0, 1.0 => certainly label 1)
>>>array([0., 1., 1., 0., 1., 0., 0., 1., 1., 0., 0., 1., 0., 1., 0., 0., 1.,
0., 0., 1., 0., 0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 1., 1.])
test_y#0=0标签,1=1标签
>>>array([0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1,
0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0], dtype=int64)
3)ROC曲线
# ROC curve: roc_curve returns the FPR/TPR pair for every threshold.
from sklearn.metrics import roc_curve

fpr, tpr, thresholds = roc_curve(test_y, yproba[:, 1])

# Plot the curve together with the y = x "random guess" diagonal.
plt.plot(fpr, tpr, label='ROC curve')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc='lower right')
4)AUC
越接近1,代表此筛选方式越好
# Area under the ROC curve; the closer to 1, the better the classifier.
from sklearn.metrics import auc

roc_auc = auc(fpr, tpr)
roc_auc
>>>0.8768382352941176
5)不同模型的auc值比较
# Fit four different classifiers on the same split and overlay their
# ROC curves, labelling each curve with its AUC.
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

dtc = DecisionTreeClassifier()
dtc.fit(train_X, train_y)
svc = SVC(probability=True)  # probability=True is required for predict_proba
svc.fit(train_X, train_y)
lg = LogisticRegression()
lg.fit(train_X, train_y)
lfc = RandomForestClassifier()
lfc.fit(train_X, train_y)

# BUG FIX: the original pasted the `for` header twice in a row and left
# the body unindented — a SyntaxError. One loop with an indented body.
models = [dtc, svc, lg, lfc]
titles = ['DecisionTreeClassifier', 'SVC', 'LogisticRegression', 'RandomForestClassifier']
for model, title in zip(models, titles):
    yproba = model.predict_proba(test_X)
    fpr, tpr, thresholds = roc_curve(test_y, yproba[:, 1])
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr, label='%s:%.2f' % (title, roc_auc))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc='lower right')
六.案例-客户流失预测模型的评估
之前用分类模型预测过用户流失(详见前文),但是并没有对各模型进行很好地验证。
1)准备数据
import pandas as pd

# Load the churn data; column 0 is the index, row 0 the header.
df = pd.read_csv('E:/Jupyter workspace/python_for_data_science/Data/customer_churn.csv', index_col=0, header=0)

# Drop the leading columns that carry no churn signal.
df = df.iloc[:, 3:]

# Convert the yes/no columns to 1/0.
for var in ['international_plan', 'voice_mail_plan', 'churn']:
    df[var] = df[var].apply(lambda x: 1 if x == 'yes' else 0)

X = df.iloc[:, :-1]  # features
y = df.iloc[:, -1]   # churn label
2)交叉验证-拆分数据
# Hold out a third of the customers for testing.
from sklearn.model_selection import train_test_split

train_X, test_X, train_y, test_y = train_test_split(
    X, y, test_size=0.33, random_state=123)
3)数据建模
# Train the four candidate models on the churn training set.

# 1. Decision tree, limited to 3 levels deep.
from sklearn import tree
dtc = tree.DecisionTreeClassifier(max_depth=3)
dtc.fit(train_X, train_y)

# 2. Support-vector classifier (probability=True enables predict_proba).
from sklearn.svm import SVC
svc = SVC(probability=True)
svc.fit(train_X, train_y)

# 3. Logistic regression.
from sklearn.linear_model import LogisticRegression
lg = LogisticRegression()
lg.fit(train_X, train_y)

# 4. Random forest.
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(train_X, train_y)
4)绘制ROC曲线和计算auc
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve
from sklearn.metrics import auc
for model,title in zip([dtc,svc,lg,rfc],['DecisionTreeClassifier','SVC','LogisticRegression','RandomForestClassifier']):
yproba=model.predict_proba(test_X)
fpr,tpr,thresholds=roc_curve(test_y,yproba[:,1])
roc_auc=auc(fpr,tpr)
plt.plot(fpr,tpr,label='%s:%.2f'%(title, roc_auc))
plt.plot([0,1],[0,1],'k--')
plt.xlim([0.0,1.0])
plt.ylim([0.0,1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC')
plt.legend(loc='lower right')
5)模型验证-评估指标
# Score the random forest on the held-out customers.
ypredicted = rfc.predict(test_X)

from sklearn.metrics import accuracy_score
accuracy_score(test_y, ypredicted)
>>>0.9463636363636364
# Confusion matrix of the random-forest predictions.
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(test_y, ypredicted)
mat
>>>array([[936, 14],
[ 45, 105]], dtype=int64)
import seaborn as sns

# fmt='.20g' prints plain integers instead of scientific notation.
sns.heatmap(mat, annot=True, fmt='.20g', square=True, cbar=False, cmap='binary')
plt.xlabel('predicted value')
plt.ylabel('true value')
# Precision, recall and F1 score per class.
from sklearn.metrics import classification_report

print(classification_report(test_y, ypredicted))
6)评估特征重要性排名
# Rank the features by the random forest's importance scores and plot them.
importances = rfc.feature_importances_
importances
index = importances.argsort()[::-1]  # indices sorted from most to least important
index
features = train_X.columns
features
features[index]

# Bar chart of importances, most important feature first.
plt.figure(figsize=(10, 5))
plt.title('Feature Importance')
plt.bar(range(0, len(features)), importances[index])
plt.xticks(range(0, len(features)), features[index], rotation=90)
plt.show()