python机器学习-预测分析核心算法3-2代码在python3下运行遇到的一些问题

最新推荐文章于 2023-08-17 15:41:52 发布

llx1026

最新推荐文章于 2023-08-17 15:41:52 发布

阅读量1.6k

点赞数 1

分类专栏： python机器学习-预测分析核心算法文章标签： python 机器学习

本文链接：https://blog.csdn.net/llx1026/article/details/77770181

版权

python机器学习-预测分析核心算法专栏收录该内容

10 篇文章 1 订阅

订阅专栏

from sklearn import datasets, linear_model
from sklearn.metrics import roc_curve, auc
import pylab as pl

def confusionMatrix(predicted, actual, threshold):
    if len(predicted) != len(actual):
        return -1
    tp = 0.0#true-postition 真正
    fp = 0.0
    tn = 0.0
    fn = 0.0

    for i in range(len(actual)):
        if actual[i] > 0.5:
            if predicted[i] > threshold: 
                tp += 1.0
            else:
                fn += 1.0
        else:
            if predicted[i] < threshold:
                tn += 1.0
            else:
                fp += 1.0
    rtn = [tp, fn, fp, tn]
    return rtn

target_url = ("http://archive.ics.uci.edu/ml/machine-learning-databases/undocumented/connectionist-bench/sonar/sonar.all-data")#书上的网址是https://我在网上输入带s的
#网址网也会打不开，去掉s就可以了，不知道其他同学有没有这种问题
data = urllib.request.urlopen(target_url)

xList = []
labels = []
for line in data:
    row = line.strip().split(",".encode(encoding='utf-8'))

    if(row[-1] == b'M'): #原书中代码是row[-1] == 'M'因为我们上面修改了代码，使row[-1]值变成了b'M',...,b'R'，如果不加以修改，那么下面得到的所有labels值都为零，调试了好久才发现的
        labels.append(1.0)
    else:
        labels.append(0.0)
    row.pop()
    floatRow = [float(num) for num in row]
    xList.append(floatRow)
#print(labels)       调试所用可忽略
#print(len(labels))  调试所用

indices = range(len(xList))
xListTest = [xList[i] for i in indices if i%3 == 0]
xListTrain = [xList[i] for i in indices if i%3 != 0]
labelsTest = [labels[i] for i in indices if i%3 == 0]
labelsTrain = [labels[i] for i in indices if i%3 != 0]


xTrain = numpy.array(xListTrain)
yTrain = numpy.array(labelsTrain)
xTest = numpy.array(xListTest)
yTest = numpy.array(labelsTest)
#print('yTrain = ', yTrain)
#print('yTest = ', yTest)

print("Shape of xTrain array", xTrain.shape)
print("Shape of yTrain array", yTrain.shape)
print("Shape of xTest array", xTest.shape)
print("Shape of yTest array", yTest.shape)

rocksVMinesModel = linear_model.LinearRegression()
rocksVMinesModel.fit(xTrain, yTrain)

trainingPredictions = rocksVMinesModel.predict(xTrain)
#print('trainingPredictions = ', trainingPredictions)
print("Some values predicted by model", trainingPredictions[0:5], trainingPredictions[-5:-1])

confusionMatTrain = confusionMatrix(trainingPredictions, yTrain, 0.5)
tp = confusionMatTrain[0]
fn = confusionMatTrain[1]
fp = confusionMatTrain[2]
tn = confusionMatTrain[3]

print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')

testPredictions = rocksVMinesModel.predict(xTest)

conMatTest = confusionMatrix(testPredictions, yTest, 0)
tp = conMatTest[0]
fn = conMatTest[1]
fp = conMatTest[2]
tn = conMatTest[3]
print("tp = " + str(tp) + "\tfn = " + str(fn) + "\n" + "fp = " + str(fp) + "\ttn = " + str(tn) + '\n')

fpr, tpr, thresholds = roc_curve(yTrain, trainingPredictions)
roc_auc = auc(fpr, tpr)
print('AUC for in-sample ROC curve: %f' % roc_auc)

#plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k-')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('In sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()

fpr, tpr, thresholds = roc_curve(yTest, testPredictions)
roc_ayc = auc(fpr, tpr)
print('AUC for out-of-sample ROC curve: %f' % roc_auc)

#plot ROC curve
pl.clf()
pl.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
pl.plot([0, 1], [0, 1], 'k-')
pl.xlim([0.0, 1.0])
pl.ylim([0.0, 1.0])
pl.xlabel('False Positive Rate')
pl.ylabel('True Positive Rate')
pl.title('Out-of-sample ROC rocks versus mines')
pl.legend(loc="lower right")
pl.show()

由于我的是python3版本，如果按照源代码row = line.strip().split(",")运行程序就会
出现如下问题，只需要在源代码上相应的修改row = line.strip().split(",".encode(encoding='utf-8'))，就是需要把原字符串类型转换成bytes类型
#-------------------------------------------------------------------------------------
Traceback (most recent call last):
File "D:\Python362\a_机器学习及实战\3_2.py", line 38, in <module>
row = line.strip().split(",")
TypeError: a bytes-like object is required, not 'str'
#--------------------------------------------------------------------------------------
修改后得到的row[-1]是这样的，那么相应的也要修改接下来的程序
----------------------------------------------------------------------------------------------
b'R'
b'R'
b'R'
b'R'
b'R'
b'R'
b'R'
...
-------------------------------------------------------------------------------------------------

'''

紧接着下面的源代码if(row[-1] == 'M'):也要做相应的修改：if(row[-1] == b'M'):，若不修改，那么接下来的到的labels值全部是零，运行代码虽然不会出错，但是预测效果也都是毫无意义的了

修改两处代码之后运行的到的结果是：

Shape of xTrain array (138, 60)
Shape of yTrain array (138,)
Shape of xTest array (70, 60)
Shape of yTest array (70,)
Some values predicted by model [-0.10240253  0.42090698  0.38593034  0.36094537  0.31520494] [ 1.12242751  0.77626699  1.02016858  0.66338081]
tp = 68.0	fn = 6.0
fp = 7.0	tn = 57.0

tp = 36.0	fn = 1.0
fp = 19.0	tn = 14.0

AUC for in-sample ROC curve: 0.979519
AUC for out-of-sample ROC curve: 0.979519
>>>