数据集中(真实类别) : 正例 反例
你的预测为正例 : A(TP) B(FP)
你的预测为反例 : C(FN) D(TN)
精确率(precision,原文称“准确率”)就是A/(A+B) 大白话就是“你预测为正例的结果里有多少是对的”
召回率(recall)就是A/(A+C) 大白话就是“数据集的正例里你的预测覆盖了多少”
"""
1.5 50 thin
1.5 60 fat
1.6 40 thin
1.6 60 fat
1.7 60 thin
1.7 80 fat
1.8 60 thin
1.8 90 fat
1.9 70 thin
1.9 80 fat
"""
#coding=utf-8
import re

import numpy as np
#import scipy as sp
from sklearn import tree
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
try:
    # scikit-learn >= 0.18 provides train_test_split in model_selection.
    from sklearn.model_selection import train_test_split
except ImportError:
    # Older releases only have the (since removed) cross_validation module.
    from sklearn.cross_validation import train_test_split
# Load the samples from train.txt. Each row is whitespace-separated: every
# column but the last is a numeric feature, the last column is the class
# label ("thin" or "fat").
data = []
labels = []
with open("train.txt", 'r') as f:
    # The first line is always discarded: a text editor such as Notepad may
    # prepend a BOM to it, which would break the float() conversion below.
    next(f, None)
    for line in f:
        # split() with no argument also drops a trailing "\r" (CRLF files)
        # and tolerates tabs or repeated spaces between the columns.
        now = line.split()
        if not now:
            # Ignore blank lines, e.g. an empty line at the end of the file.
            continue
        data.append([float(i) for i in now[:-1]])
        labels.append(now[-1])
x = np.array(data)
labels = np.array(labels)
# Encode the labels numerically: 1 where the label is "fat", 0 otherwise.
y = np.zeros(labels.shape)
y[labels == 'fat'] = 1
# Split the samples: 80% for training, 20% held out.
# NOTE: x_test / y_test are not used below; every metric in this script is
# computed on the training split or on the full data set.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

# Decision tree that selects its splits with the entropy criterion.
clf = tree.DecisionTreeClassifier(criterion="entropy")
clf.fit(x_train, y_train)

# Write the fitted tree to tree.dot in Graphviz format.
with open("tree.dot", "w") as f:
    tree.export_graphviz(clf, out_file=f)

# Importance of each feature column in the fitted tree.
print(clf.feature_importances_)

# Fraction of training samples classified correctly (the original author's
# note reports 100% on the training data).
ans = clf.predict(x_train)
print(np.mean(ans == y_train))

# Precision / recall on the training split, computed from the hard
# predictions obtained above (reused instead of predicting a second time).
precision, recall, thresholds = precision_recall_curve(y_train, ans)
# Single pre-formatted argument so the output is the same space-separated
# line whether print is the Python 2 statement or the Python 3 function.
print("%s %s %s" % (precision, recall, thresholds))

# classification_report expects predicted class labels, not probabilities,
# so use predict() rather than predict_proba()[:, 1]; the latter only worked
# while every probability happened to be exactly 0 or 1.
answer = clf.predict(x)
print(classification_report(y, answer, target_names=["thin", "fat"]))