import urllib.request

import numpy
import pylab as pl
from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
def confusionMatrix(predicted, actual, threshold):
    """Tally a binary confusion matrix for score-based predictions.

    An example counts as actually positive when its ``actual`` value is
    greater than 0.5, and as predicted positive when its ``predicted`` score
    is strictly greater than ``threshold``. A score exactly equal to
    ``threshold`` is therefore a negative prediction for both classes
    (previously such a tie was scored as a false negative for positives but
    as a false positive for negatives).

    Args:
        predicted: Sequence of numeric scores, one per example.
        actual: Sequence of numeric ground-truth labels, same length as
            ``predicted``.
        threshold: Score cutoff above which an example is predicted positive.

    Returns:
        ``[tp, fn, fp, tn]`` as floats, or ``-1`` when the two sequences
        differ in length (kept as a sentinel for backward compatibility).
    """
    if len(predicted) != len(actual):
        return -1

    tp = 0.0  # true positives
    fn = 0.0  # false negatives
    fp = 0.0  # false positives
    tn = 0.0  # true negatives
    for score, truth in zip(predicted, actual):
        predictedPositive = score > threshold
        if truth > 0.5:
            if predictedPositive:
                tp += 1.0
            else:
                fn += 1.0
        elif predictedPositive:
            fp += 1.0
        else:
            tn += 1.0
    return [tp, fn, fp, tn]
def _plotRoc(fpr, tpr, rocAuc, title):
    """Draw one ROC curve plus the diagonal reference line and show it."""
    pl.clf()
    pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % rocAuc)
    pl.plot([0, 1], [0, 1], 'k-')
    pl.xlim([0.0, 1.0])
    pl.ylim([0.0, 1.0])
    pl.xlabel('False Positive Rate')
    pl.ylabel('True Positive Rate')
    pl.title(title)
    pl.legend(loc="lower right")
    pl.show()


# The book lists an https:// address for this data set; the author reported
# that it would not open and that the plain http:// form works.
target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/"
              "undocumented/connectionist-bench/sonar/sonar.all-data")

# Score cutoff shared by the in-sample and out-of-sample confusion matrices.
# Labels are 0.0/1.0, so 0.5 is the midpoint. The out-of-sample call used to
# pass 0, which made the two matrices incomparable.
threshold = 0.5

xList = []
labels = []
# Use the response as a context manager so the connection is always closed.
with urllib.request.urlopen(target_url) as data:
    for line in data:
        # urlopen yields bytes under Python 3; decode once here so the rest
        # of the parsing works on ordinary str values.
        text = line.decode("utf-8").strip()
        if not text:
            continue  # tolerate blank lines instead of adding an empty row
        row = text.split(",")
        # The last field is the class letter: 'M' -> 1.0, anything else -> 0.0.
        labels.append(1.0 if row.pop() == "M" else 0.0)
        xList.append([float(num) for num in row])

# Hold out every third row (indices 0, 3, 6, ...) as the test set.
indices = range(len(xList))
xListTest = [xList[i] for i in indices if i % 3 == 0]
xListTrain = [xList[i] for i in indices if i % 3 != 0]
labelsTest = [labels[i] for i in indices if i % 3 == 0]
labelsTrain = [labels[i] for i in indices if i % 3 != 0]

xTrain = numpy.array(xListTrain)
yTrain = numpy.array(labelsTrain)
xTest = numpy.array(xListTest)
yTest = numpy.array(labelsTest)

print("Shape of xTrain array", xTrain.shape)
print("Shape of yTrain array", yTrain.shape)
print("Shape of xTest array", xTest.shape)
print("Shape of yTest array", yTest.shape)

# Fit ordinary least squares on the 0/1 labels and use the output as a score.
rocksVMinesModel = linear_model.LinearRegression()
rocksVMinesModel.fit(xTrain, yTrain)

trainingPredictions = rocksVMinesModel.predict(xTrain)
# Show the first five and the last five predictions ([-5:-1] dropped the last).
print("Some values predicted by model", trainingPredictions[0:5], trainingPredictions[-5:])

# In-sample confusion matrix.
confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, threshold)
tp, fn, fp, tn = confusionMatTrain
print("tp = %s\tfn = %s\nfp = %s\ttn = %s\n" % (tp, fn, fp, tn))

# Out-of-sample confusion matrix, using the same cutoff.
testPredictions = rocksVMinesModel.predict(xTest)
conMatTest = confusionMatrix(testPredictions, yTest, threshold)
tp, fn, fp, tn = conMatTest
print("tp = %s\tfn = %s\nfp = %s\ttn = %s\n" % (tp, fn, fp, tn))

# In-sample ROC curve and AUC.
fpr, tpr, thresholds = roc_curve(yTrain, trainingPredictions)
roc_auc = auc(fpr, tpr)
print('AUC for in-sample ROC curve: %f' % roc_auc)
_plotRoc(fpr, tpr, roc_auc, 'In sample ROC rocks versus mines')

# Out-of-sample ROC curve and AUC. The result must be stored in roc_auc: the
# earlier "roc_ayc" typo made this section report the in-sample AUC again.
fpr, tpr, thresholds = roc_curve(yTest, testPredictions)
roc_auc = auc(fpr, tpr)
print('AUC for out-of-sample ROC curve: %f' % roc_auc)
_plotRoc(fpr, tpr, roc_auc, 'Out-of-sample ROC rocks versus mines')
# 由于我使用的是 Python 3,如果按照原书代码 row = line.strip().split(",") 运行,程序就会
# 出现如下错误。只需把该行相应地修改为 row = line.strip().split(",".encode(encoding='utf-8')),即把分隔符从 str 类型转换成 bytes 类型。
#-------------------------------------------------------------------------------------
# Traceback (most recent call last):
#   File "D:\Python362\a_机器学习及实战\3_2.py", line 38, in <module>
#     row = line.strip().split(",")
# TypeError: a bytes-like object is required, not 'str'
#--------------------------------------------------------------------------------------
# 修改后得到的 row[-1] 是下面这样的 bytes 值,因此接下来的程序也要做相应的修改:
# ----------------------------------------------------------------------------------------------
# b'R'
# b'R'
# b'R'
# b'R'
# b'R'
# b'R'
# b'R'
# ...
# -------------------------------------------------------------------------------------------------
'''
紧接着下面的源代码 if(row[-1] == 'M'): 也要做相应的修改:if(row[-1] == b'M'):。若不修改,那么接下来得到的 labels 值全部是零,运行代码虽然不会出错,但预测结果也就毫无意义了。
修改两处代码之后,运行得到的结果是:
Shape of xTrain array (138, 60)
Shape of yTrain array (138,)
Shape of xTest array (70, 60)
Shape of yTest array (70,)
Some values predicted by model [-0.10240253 0.42090698 0.38593034 0.36094537 0.31520494] [ 1.12242751 0.77626699 1.02016858 0.66338081]
tp = 68.0 fn = 6.0
fp = 7.0 tn = 57.0
tp = 36.0 fn = 1.0
fp = 19.0 tn = 14.0
AUC for in-sample ROC curve: 0.979519
AUC for out-of-sample ROC curve: 0.979519
>>>