1. Introduction
My graduation thesis compares a traditional object-detection method against deep-learning methods, which is how I came to HOG+SVM. Getting it to output mAP meant stepping into quite a few pits, so I'm recording them here in memory of the pits I climbed out of!
2. Pitfall Roundup
Huge thanks to this blogger, whose code saved me: https://arleyzhang.github.io/articles/c521a01c/. The explanations there are very clear, from splitting the dataset, to training the SVM, to producing the mAP result plots. Brilliant.
The original post uses the iris dataset, where the data and labels come straight from the loader. With our own dataset, we first have to separate the positive and negative samples, extract the corresponding features for each, and then collect the data and labels with the code below (a sketch of the feature-extraction step follows the loader):
import glob
import os

import joblib  # on older scikit-learn: from sklearn.externals import joblib
import numpy as np

pos_feat_path = '../data/features/pos'
neg_feat_path = '../data/features/neg'

# Classifiers supported
clf_type = 'LIN_SVM'

fds = []
labels = []
print(len(fds), len(labels))

# Load the positive features
for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
    fd = joblib.load(feat_path)
    fds.append(fd)
    labels.append(1)
print(len(fds), len(labels))

# Load the negative features
for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
    fd = joblib.load(feat_path)
    fds.append(fd)
    labels.append(0)
print(np.array(fds).shape, len(labels))
print(len(fds), len(labels))
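For context, each .feat file is just a feature vector dumped with joblib. Below is a minimal sketch of how the positive features might be produced with HOG; the image directory, file extension, and HOG parameters are illustrative assumptions (not from the original post), and skimage is assumed to be installed:

import glob
import os

import joblib
from skimage.feature import hog
from skimage.io import imread

pos_img_path = '../data/images/pos'      # hypothetical input directory
pos_feat_path = '../data/features/pos'   # matches the loader above
os.makedirs(pos_feat_path, exist_ok=True)

for img_path in glob.glob(os.path.join(pos_img_path, '*.png')):
    image = imread(img_path, as_gray=True)  # crops should share one fixed size
    # Common HOG settings; tune them to your detection window
    fd = hog(image, orientations=9, pixels_per_cell=(8, 8),
             cells_per_block=(2, 2), block_norm='L2-Hys')
    feat_name = os.path.splitext(os.path.basename(img_path))[0] + '.feat'
    joblib.dump(fd, os.path.join(pos_feat_path, feat_name))

The negative samples go through the same loop into ../data/features/neg; the loader above takes care of the labels.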
1. With the dataset prepared, running the code directly raised an error. After a round of Stack Overflow, the problem looked like the dataset split itself: the linked post only keeps the first two classes of the iris dataset. The relevant code:
X_train, X_test, y_train, y_test = train_test_split(X[y < 2], y[y < 2],
                                                    test_size=.5,
                                                    random_state=random_state)
My dataset has only one object class, so that [y < 2] masking is guaranteed to misbehave here. Simply delete it:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=.5,
                                                    random_state=random_state)
2. With the split fixed, the next error appeared, and the message was explicit: only integer scalar arrays can be converted to an index, with the traceback pointing at y_true = y_true[desc_score_indices]. Printing y_true showed it was a plain Python list, e.g. [0, 1, 0, 0, 1, 1, 0, 1], and indexing a list that way reproduces the error.
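A minimal reproduction (the scores are made up just to obtain an index array):

import numpy as np

y_true = [0, 1, 0, 0, 1, 1, 0, 1]                     # plain Python list
y_score = np.array([.9, .2, .8, .1, .7, .6, .3, .5])  # made-up scores
desc_score_indices = np.argsort(y_score)[::-1]

y_true[desc_score_indices]
# TypeError: only integer scalar arrays can be converted to a scalar index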
The cause: y_true is an ordinary Python list, so it cannot be indexed by another array of indices (unless that array contains a single element). Add y_true = np.array(y_true) before y_true = y_true[desc_score_indices] and it works, as shown here:
y_true = np.array(y_true)
y_true = y_true[desc_score_indices]
With that fixed, the script runs to completion and plots the PR curves.
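For reference, the merged code below computes three standard flavors of AP. With $P_n$ and $R_n$ the precision and recall at the $n$-th score threshold:

$$\mathrm{AP}_{\text{approx}} = \sum_n (R_n - R_{n-1})\,P_n, \qquad P_{\text{interp}}(R) = \max_{R' \ge R} P(R'), \qquad \mathrm{AP}_{\text{11pt}} = \frac{1}{11} \sum_{R \in \{0,\,0.1,\,\ldots,\,1.0\}} P_{\text{interp}}(R)$$

These are the textbook definitions (the 11-point form is the one used in PASCAL VOC); the comments in the code note where it deviates slightly from sklearn.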
3. Code
Here is the code after merging everything:
# coding=utf-8
import glob
import os

import joblib  # newer scikit-learn: joblib ships separately (was sklearn.externals.joblib)
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

from config import *  # provides model_path
def preprocess_iris_data():
    pos_feat_path = '../data/features/pos'
    neg_feat_path = '../data/features/neg'

    # Classifiers supported
    clf_type = 'LIN_SVM'

    fds = []
    labels = []
    print(len(fds), len(labels))

    # Load the positive features
    for feat_path in glob.glob(os.path.join(pos_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        fds.append(fd)
        labels.append(1)
    print(len(fds), len(labels))

    # Load the negative features
    for feat_path in glob.glob(os.path.join(neg_feat_path, "*.feat")):
        fd = joblib.load(feat_path)
        fds.append(fd)
        labels.append(0)
    print(np.array(fds).shape, len(labels))

    X = fds
    y = labels

    random_state = np.random.RandomState(0)
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=.5,
                                                        random_state=random_state)

    # Create a simple classifier (`is` replaced with ==; `is` compares identity, not value)
    if clf_type == "LIN_SVM":
        clf = LinearSVC()
        print("Training a Linear SVM Classifier")
        clf.fit(X_train, y_train)
        y_score = clf.decision_function(X_test)
        # Create the model directory if it doesn't exist (model_path is
        # treated as a directory here, since 'svm.model' is joined onto it)
        if not os.path.isdir(model_path):
            os.makedirs(model_path)
        save_model_path = os.path.join(model_path, 'svm.model')
        joblib.dump(clf, save_model_path)
        print("Classifier saved to {}".format(save_model_path))
        return y_test, y_score
def precision_recall_curve(y_true, y_score, pos_label=None):
    if pos_label is None:
        pos_label = 1
    # Different sort kinds give slightly different results: kind="mergesort"
    # and kind="quicksort" can disagree, because merge sort is stable while
    # quick sort is not; sklearn uses kind="mergesort".
    desc_score_indices = np.argsort(y_score, kind="quicksort")[::-1]
    y_score = y_score[desc_score_indices]
    # y_true is a plain Python list, so convert it before fancy indexing
    y_true = np.array(y_true)
    y_true = y_true[desc_score_indices]
    # Determine the threshold indices. The scores may contain duplicates;
    # sklearn drops the duplicates. I expected dropping duplicate scores to
    # change the result (if two samples share a score but one is labeled 1
    # and the other 0, removing either seems like it should matter), yet
    # experiments show it doesn't; something to analyze another time.
    # distinct_value_indices = np.where(np.diff(y_score))[0]
    # threshold_idxs = np.r_[distinct_value_indices, y_true.size - 1]
    # Duplicate scores are NOT removed here
    threshold_idxs = np.arange(y_score.size)
    # Walking down through the thresholds, count the true positives at each
    # one; tps[-1] is the total number of positives
    tps = np.cumsum(y_true * 1.)[threshold_idxs]
    # False positives at each threshold; they relate to tps by
    # fps = 1 + threshold_idxs - tps, which follows directly from the counts
    fps = 1 + threshold_idxs - tps
    # Pick out the threshold values at those indices
    thresholds = y_score[threshold_idxs]
    precision = tps / (tps + fps)
    recall = tps / tps[-1]
    # Slightly different from sklearn: output every sample point by setting
    # last_ind = tps.size
    last_ind = tps.size
    sl = slice(0, last_ind)
    # Prepend (recall=0, precision=1) so the curve starts on the y-axis
    return np.r_[1, precision[sl]], np.r_[0, recall[sl]], thresholds[sl]
def average_precision_approximated(y_true, y_predict):
    """
    Compute the approximated AP: every sample point's score is a recall cutoff.
    :param y_true: labels
    :param y_predict: predicted scores
    :return: precision, recall, thresholds, average precision
    """
    precision, recall, thresholds = precision_recall_curve(
        y_true, y_predict, pos_label=1)
    average_precision = np.sum(np.diff(recall) * np.array(precision)[1:])
    return precision, recall, thresholds, average_precision
def average_precision_interpolated(y_true, y_predict):
    """
    Compute the interpolated AP: every positive sample's score is a recall cutoff.
    :param y_true: labels
    :param y_predict: predicted scores
    :return: precision, recall, thresholds, average precision
    """
    precision, recall, thresholds = precision_recall_curve(
        y_true, y_predict, pos_label=1)
    # Find the cut points along the recall axis. np.insert(recall, 0, -1, axis=0)
    # guarantees we pick the FIRST index of each repeated recall value, because
    # among equal recalls only the first precision is the maximum, and that is
    # the one we want. Put differently: every positive sample contributes its
    # recall value as a cut on the horizontal axis.
    recall_cutoff_index = np.where(
        np.diff(np.insert(recall, 0, -1, axis=0)))[0]
    # From each recall cutoff onward take the maximum precision; among equal
    # precisions keep the one with the largest index:
    # P(r) = max{P(r') | r' >= r}
    precision_cutoff_index = []
    for index in recall_cutoff_index:
        precision_cutoff_index.append(
            max([x for x in np.where(precision == np.max(precision[index:]))[0] if x >= index]))
    # Slice the original precision and recall at those indices to obtain the
    # interpolated precision, recall, and AP
    precision_interpolated = precision[precision_cutoff_index]
    recall_interpolated = recall[recall_cutoff_index]
    # The slices above include the artificial (recall=0, precision=1) point
    # added to close the curve against the axes; thresholds_interpolated must
    # shift past it, so the thresholds do not include recall=0
    thresholds_interpolated = thresholds[
        [x - 1 for x in recall_cutoff_index if 0 <= x - 1 < thresholds.size]]
    # Strictly, AP should be computed as an area (the commented lines below),
    # but the paper takes the plain mean. Since the thresholds exclude
    # recall=0, the two give the same result here.
    average_precision = np.mean(precision_interpolated[1:])
    # average_precision = np.sum(
    #     np.diff(recall_interpolated) * np.array(precision_interpolated)[1:])
    return precision_interpolated, recall_interpolated, thresholds_interpolated, average_precision
def average_precision_11point_interpolated(y_true, y_predict):
    """
    Compute the 11-point interpolated AP.
    :param y_true: labels
    :param y_predict: predicted scores
    :return: precision, recall, thresholds, average precision
    """
    precision, recall, thresholds = precision_recall_curve(
        y_true, y_predict, pos_label=1)
    recall_11point_cutoff = np.array(
        [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    # For each of the 11 cutoffs, find the first sample point whose recall
    # reaches it
    recall_cutoff_index = []
    for cutoff in recall_11point_cutoff:
        recall_cutoff_index.append(np.where(recall >= cutoff)[0][0])
    precision_cutoff_index = []
    for index in recall_cutoff_index:
        precision_cutoff_index.append(
            max([x for x in np.where(precision == np.max(precision[index:]))[0] if x >= index]))
    precision_11point = precision[precision_cutoff_index]
    recall_11point = recall[recall_cutoff_index]
    # Here the thresholds DO include recall=0, because all 11 points count
    thresholds_11point = thresholds[
        [x - 1 for x in recall_cutoff_index if -1 <= x - 1 < thresholds.size]]
    # Because recall=0 is included, the two ways of computing AP differ
    # slightly in this case; the paper's plain mean is used
    average_precision = np.mean(precision_11point)
    # average_precision = np.sum(
    #     0.1 * np.array(precision_11point)[1:])
    # recall_11point_cutoff is returned directly; returning recall_11point
    # also works, the only difference being that the curve's turning points
    # would then not sit exactly at the 11 ticks 0, 0.1, ..., 1.0
    # return precision_11point, recall_11point, thresholds_11point, average_precision
    return precision_11point, recall_11point_cutoff, thresholds_11point, average_precision
def main(data=None):
    y_test = []
    y_score = []
    if data == 'iris':
        # Load the features and train a simple SVM classifier, so the PR
        # curves can be drawn from a complete model (the function keeps its
        # iris-era name, but above it loads the HOG features)
        y_test, y_score = preprocess_iris_data()
    # (Called without data='iris', y_test/y_score stay empty; the original
    # post used that path for a different worked example, see __main__ below)

    # Compute the APs and plot
    precision_approximated, recall_approximated, _, ap_approximated = \
        average_precision_approximated(y_test, y_score)
    precision_interpolated, recall_interpolated, _, ap_interpolated = \
        average_precision_interpolated(y_test, y_score)
    precision_11point, recall_11point, _, ap_11point = \
        average_precision_11point_interpolated(y_test, y_score)

    print('Approximated average precision-recall score: {0:0.5f}'.format(
        ap_approximated))
    print('Interpolated average precision-recall score: {0:0.5f}'.format(
        ap_interpolated))
    print('Interpolated at fixed 11 points average precision-recall score: {0:0.5f}'.format(
        ap_11point))

    # Plot the approximated PR curve
    fig1 = plt.figure('fig1')
    plt.plot(recall_approximated, precision_approximated,
             color='r', marker='o', mec='m', ms=3)
    plt.step(recall_approximated, precision_approximated,
             color='c', where='pre')
    plt.fill_between(recall_approximated, precision_approximated, step='pre', alpha=0.2,
                     color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1])
    plt.title('2-class Precision-Recall curve(Approximated): AP={0:0.5f}'.format(
        ap_approximated))
    plt.xticks(np.arange(0, 1, 0.1))
    plt.yticks(np.arange(0, 1, 0.1))
    plt.grid(True)
    plt.legend(('PR-curve', 'Approximated-PR-curve', 'Approximated-AP'),
               loc='upper right')

    # Plot the interpolated PR curve
    fig2 = plt.figure('fig2')
    plt.plot(recall_approximated, precision_approximated,
             color='r', marker='o', mec='m', ms=3)
    plt.plot(recall_interpolated, precision_interpolated,
             color='c', marker='o', mec='g', ms=3, alpha=0.5)
    plt.fill_between(recall_interpolated, precision_interpolated, step='pre', alpha=0.2,
                     color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1])
    plt.title('2-class Precision-Recall curve(Interpolated): AP={0:0.5f}'.format(
        ap_interpolated))
    plt.xticks(np.arange(0, 1, 0.1))
    plt.yticks(np.arange(0, 1, 0.1))
    plt.grid(True)
    plt.legend(('PR-curve', 'Interpolated-PR-curve', 'Interpolated-AP'),
               loc='upper right')

    # Plot the 11-point interpolated PR curve
    fig3 = plt.figure('fig3')
    plt.plot(recall_approximated, precision_approximated,
             color='r', marker='o', mec='m', ms=3)
    plt.plot(recall_11point, precision_11point,
             color='c', marker='o', mec='g', ms=3)
    plt.fill_between(recall_11point, precision_11point, step='pre', alpha=0.2,
                     color='b')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1])
    plt.title('2-class Precision-Recall curve(Interpolated_11point): AP={0:0.5f}'.format(
        ap_11point))
    plt.xticks(np.arange(0, 1, 0.1))
    plt.yticks(np.arange(0, 1, 0.1))
    plt.grid(True)
    plt.legend(('PR-curve', '11point-PR-curve', '11point-AP'),
               loc='upper right')

    plt.show()


if __name__ == '__main__':
    # main()  # tests the simple example from the blog post "多标签图像分类任务的评价方法-mAP"
    main('iris')