构建我的第一个机器学习----岩石水雷声呐分类器

本文链接：https://blog.csdn.net/joliph/article/details/77997416

终于要构建机器学习分类器啦，激动！学完这个才发现，Python 实在是会让人变懒的：构建分类器竟然不用自己写线性回归的学习过程……直接用 sklearn 包就可以了。
部分资料参考自：参考网址&书
训练测试数据：数据地址
直接贴代码吧：

# -*- coding:utf-8 -*-
import urllib.request
import numpy
import random
from sklearn import datasets,linear_model
from sklearn.metrics import roc_curve,auc
import pylab as pl

def confusionmatrix(predicted,actual,threshold):
    """Count confusion-matrix cells for thresholded binary predictions.

    A sample is an actual positive when its label is greater than 0.5, and a
    predicted positive when its score is strictly greater than ``threshold``.
    A score exactly equal to ``threshold`` is a predicted negative for every
    sample, whatever its actual label, so one decision rule is applied to
    both classes.

    Args:
        predicted: Sequence of numeric scores, one per sample.
        actual: Sequence of numeric labels, aligned with ``predicted``.
        threshold: Cut-off applied to each score.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length.
    """
    # The -1 sentinel is kept so existing callers see the same contract.
    if len(predicted) != len(actual):
        return -1
    tp = fn = fp = tn = 0.0
    for score, label in zip(predicted, actual):
        predicted_positive = score > threshold
        if label > 0.5:
            if predicted_positive:
                tp += 1.0
            else:
                fn += 1.0
        elif predicted_positive:
            fp += 1.0
        else:
            tn += 1.0
    return [tp, fn, fp, tn]

url='http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data'

# Parse the download into feature rows (xlist) and numeric labels (labels):
# the last comma-separated field of each line is the class, the rest are floats.
xlist=[]
labels=[]
# The context manager closes the HTTP response once every line has been read.
with urllib.request.urlopen(url) as data:
    for line in data:
        line=line.strip()
        if not line:
            # A blank line would otherwise add an empty feature row and a
            # spurious 0.0 label.
            continue
        row=line.split(b',')
        # The lines read from the response are bytes, so the class field must
        # be compared with the bytes literal b"M"; comparing with the str 'M'
        # never matches and every label would come out as 0.0.
        if row.pop()==b"M":
            labels.append(1.0)
        else:
            labels.append(0.0)
        floatrow=[float(num) for num in row]
        xlist.append(floatrow)

# Hold out every third sample (indices 0, 3, 6, ...) as the test set and keep
# the remaining samples for training.
xlisttest=[]
xlisttrain=[]
labelstest=[]
labelstrain=[]
for idx in range(len(xlist)):
    if idx%3:
        xlisttrain.append(xlist[idx])
        labelstrain.append(labels[idx])
    else:
        xlisttest.append(xlist[idx])
        labelstest.append(labels[idx])

# Convert the Python lists to numpy arrays.
xtrain=numpy.array(xlisttrain)
ytrain=numpy.array(labelstrain)
xtest=numpy.array(xlisttest)
ytest=numpy.array(labelstest)

# Report the array shapes as a quick sanity check of the split.
for caption,arr in (
    ("Shape of xTrain array",xtrain),
    ("Shape of yTrain array",ytrain),
    ("Shape of xTest array",xtest),
    ("Shape of yTest array",ytest),
):
    print(caption,arr.shape)

# Fit a linear regression on the training arrays.
rocksvminesmodel=linear_model.LinearRegression()
rocksvminesmodel.fit(xtrain,ytrain)

# In-sample: predict on the training rows, then count outcomes at a 0.5
# threshold. confusionmatrix() returns its counts in [tp, fn, fp, tn] order.
trainingpredictions=rocksvminesmodel.predict(xtrain)
confusionmattrain=confusionmatrix(trainingpredictions,ytrain,0.5)
tp,fn,fp,tn=confusionmattrain

print("".join(["tp = ",str(tp),"\tfn = ",str(fn),"\n","fp = ",str(fp),"\ttn = ",str(tn),'\n']))

# Out-of-sample: the same counts on the held-out test rows.
testpredictions=rocksvminesmodel.predict(xtest)
conmattest=confusionmatrix(testpredictions,ytest,0.5)
tp,fn,fp,tn=conmattest

print("".join(["tp = ",str(tp),"\tfn = ",str(fn),"\n","fp = ",str(fp),"\ttn = ",str(tn),'\n']))
def _plotroc(fpr,tpr,roc_auc,title):
    """Draw one ROC curve with its AUC in the legend and show the figure."""
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    pl.plot([0, 1], [0, 1], 'k-')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(title)
    pl.legend(loc="lower right")
    pl.show()


# Use roc_curve() to obtain fpr and tpr (false positive rate, true positive
# rate) together with the thresholds, first for the training predictions.
fpr,tpr,thresholds=roc_curve(ytrain,trainingpredictions)
roc_auc = auc(fpr, tpr)
print('AUC for in-sample ROC curve: '+str(roc_auc))
_plotroc(fpr,tpr,roc_auc,'In sample ROC rocks versus mines')

# Same again for the test predictions; this plot gets its own title so the
# two figures can be told apart.
fpr,tpr,thresholds=roc_curve(ytest,testpredictions)
roc_auc = auc(fpr, tpr)
print('AUC for out-sample ROC curve: '+str(roc_auc))
_plotroc(fpr,tpr,roc_auc,'Out of sample ROC rocks versus mines')

测试结果：

Shape of xTrain array (138, 60)
Shape of yTrain array (138,)
Shape of xTest array (70, 60)
Shape of yTest array (70,)
tp = 68.0       fn = 6.0
fp = 7.0        tn = 57.0

tp = 28.0       fn = 9.0
fp = 9.0        tn = 24.0

AUC for in-sample ROC curve: 0.979518581081
AUC for out-sample ROC curve: 0.848484848485