终于要构建机器学习分类器啦,激动!学完这个才发现,Python 实在是会让人变懒的:构建分类器竟然不用自己写线性回归的学习过程……直接用 sklearn 包就可以了。
部分资料参考自:参考网址&书
训练测试数据:数据地址
直接贴代码吧:
# -*- coding:utf-8 -*-
import urllib.request
import numpy
import random
from sklearn import datasets,linear_model
from sklearn.metrics import roc_curve,auc
import pylab as pl
def confusionmatrix(predicted,actual,threshold):
    """Count true/false positives and negatives for thresholded scores.

    A sample is *actually* positive when its label is greater than 0.5, and
    *predicted* positive when its score is strictly greater than
    ``threshold``.  The same rule is applied to both classes, so a score
    exactly equal to ``threshold`` is always treated as a negative
    prediction.

    Args:
        predicted: sequence of numeric scores, one per sample.
        actual: sequence of numeric labels, same length as ``predicted``.
        threshold: cut-off applied to ``predicted``.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length (kept for backward compatibility with callers).
    """
    if len(predicted) != len(actual):
        return -1

    tp = 0.0
    fp = 0.0
    tn = 0.0
    fn = 0.0
    for score, label in zip(predicted, actual):
        is_positive = label > 0.5
        # One decision rule for both classes; previously a score equal to
        # the threshold was a negative prediction for positive samples but
        # a positive prediction for negative samples.
        predicted_positive = score > threshold
        if is_positive:
            if predicted_positive:
                tp += 1.0
            else:
                fn += 1.0
        else:
            if predicted_positive:
                fp += 1.0
            else:
                tn += 1.0
    return [tp, fn, fp, tn]
# Download the sonar data set and parse it into feature rows and 0/1 labels.
# Each line is a comma-separated record whose last field is the class label.
url='http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'
xlist=[]
labels=[]
# Use the response as a context manager so the connection is closed once the
# file has been read, even if parsing raises.
with urllib.request.urlopen(url) as data:
    for line in data:
        stripped = line.strip()
        if not stripped:
            # Skip blank lines; they would otherwise add an empty feature row.
            continue
        row = stripped.split(b',')
        # urlopen yields bytes, so the label must be compared with b"M".
        # The book's original code compared against the str 'M', which never
        # matches bytes and silently makes every label 0.0.
        if row[-1] == b"M":
            labels.append(1.0)
        else:
            labels.append(0.0)
        # Drop the label so only the numeric fields remain.
        row.pop()
        floatrow = [float(num) for num in row]
        xlist.append(floatrow)
# Hold out every third sample (indices 0, 3, 6, ...) for testing and train on
# the remaining two thirds.
test_idx = [i for i in range(len(xlist)) if i % 3 == 0]
train_idx = [i for i in range(len(xlist)) if i % 3 != 0]

xlisttest = [xlist[i] for i in test_idx]
xlisttrain = [xlist[i] for i in train_idx]
labelstest = [labels[i] for i in test_idx]
labelstrain = [labels[i] for i in train_idx]

# Convert the Python lists to numpy arrays for scikit-learn.
xtrain = numpy.array(xlisttrain)
ytrain = numpy.array(labelstrain)
xtest = numpy.array(xlisttest)
ytest = numpy.array(labelstest)

# Debug aids: uncomment to inspect the training arrays.
#print(xtrain)
#print(ytrain)
for caption, arr in (
    ("Shape of xTrain array", xtrain),
    ("Shape of yTrain array", ytrain),
    ("Shape of xTest array", xtest),
    ("Shape of yTest array", ytest),
):
    print(caption, arr.shape)
# Fit a linear regression on the 0/1 labels; its real-valued output is later
# thresholded at 0.5 to act as a classifier.
rocksvminesmodel = linear_model.LinearRegression()
rocksvminesmodel.fit(xtrain, ytrain)

# Confusion matrix on the training set (in-sample performance).
trainingpredictions = rocksvminesmodel.predict(xtrain)
#print('the trainingpredictions=', trainingpredictions)
confusionmattrain = confusionmatrix(trainingpredictions, ytrain, 0.5)
tp, fn, fp, tn = confusionmattrain
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')

# Confusion matrix on the held-out test set (out-of-sample performance).
testpredictions = rocksvminesmodel.predict(xtest)
conmattest = confusionmatrix(testpredictions, ytest, 0.5)
tp, fn, fp, tn = conmattest
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')
# roc_curve() yields the false positive rates, the true positive rates and
# the thresholds they were computed at; auc() integrates the resulting curve.
fpr, tpr, thresholds = roc_curve(ytrain, trainingpredictions)
roc_auc = auc(fpr, tpr)
print('AUC for in-sample ROC curve: '+str(roc_auc))

# Draw the in-sample ROC curve together with the diagonal reference line.
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k-')
pl.title('In sample ROC rocks versus mines')
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.legend(loc="lower right")
pl.show()
# ROC curve and AUC for the held-out test set (out-of-sample performance).
fpr,tpr,thresholds=roc_curve(ytest,testpredictions)
roc_auc = auc(fpr, tpr)
print('AUC for out-sample ROC curve: '+str(roc_auc))
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
pl.plot([0, 1], [0, 1], 'k-')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
# This figure shows the test-set curve; the title previously read
# "In sample" because it was copied from the training-set plot.
pl.title('Out-of-sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()
测试结果:
Shape of xTrain array (138, 60)
Shape of yTrain array (138,)
Shape of xTest array (70, 60)
Shape of yTest array (70,)
tp = 68.0 fn = 6.0
fp = 7.0 tn = 57.0
tp = 28.0 fn = 9.0
fp = 9.0 tn = 24.0
AUC for in-sample ROC curve: 0.979518581081
AUC for out-sample ROC curve: 0.848484848485
可以看到训练结果还是不错的