SVM案例分析：Otto商品分类

最新推荐文章于 2023-09-13 11:04:27 发布

fly_Xiaoma

最新推荐文章于 2023-09-13 11:04:27 发布

阅读量1.8k

点赞数

分类专栏： machineLearning

本文链接：https://blog.csdn.net/weixin_38664232/article/details/87790838

版权

machineLearning 专栏收录该内容

33 篇文章 2 订阅

订阅专栏

一.SVM基础部分

1.工具

import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV

#竞赛的评价指标为logloss，但LinearSVC不支持概率
#所以这个例子中我们用正确率accuracy_score作为模型选择的度量
#如果要将LinearSVC的输出转换成概率，可通过概率校准工具CalibratedClassifierCV实现
#SVC也是通过类似的方式支持概率输出
from sklearn.metrics import accuracy_score

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from matplotlib import pyplot as plt

2.读取数据

# Load the training data.
dpath = './data/'

# Use the original features plus the tf-idf features.
# A linear SVM still trains quickly on both feature sets combined; the RBF
# kernel is far too slow on them, so the RBF experiment uses tf-idf only.
train1 = pd.read_csv(dpath + "Otto_FE_train_org.csv")
train2 = pd.read_csv(dpath + "Otto_FE_train_tfidf.csv")

# Drop the columns of train2 that train1 already carries, so the concatenated
# frame does not end up with duplicate 'id' / 'target' columns.
# (All labels go in ONE list; a second positional list would be taken as `axis`.)
train2 = train2.drop(["id", "target"], axis=1)
train = pd.concat([train1, train2], axis=1, ignore_index=False)
train.head()

# Release the intermediate frames; only the merged frame is used from here on.
del train1
del train2

3.准备数据

# Split the frame into labels and features: keep 'target' as the label vector
# and drop 'id' and 'target' from the feature matrix.
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)


# Keep the feature names for later use (visualisation).
feat_names = X_train.columns

# Most sklearn estimators accept sparse input, which makes training much faster.
from scipy.sparse import csr_matrix
X_train = csr_matrix(X_train)


# There are 60k+ training samples, so cross-validation is too slow; estimate
# model performance with a single hold-out split instead.
# (SVMs do not scale well to large sample counts.)
from sklearn.model_selection import train_test_split
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train, train_size=0.8, random_state=0)

print(X_train_part.shape)

数据结果：

4.模型训练

默认参数的SVC

from sklearn.svm import LinearSVC

#LinearSVC不能得到每类的概率（只有predict函数，没有predict_proba函数），Otto数据集
#要求输出每类的概率

# 1 + 2. Build a LinearSVC with default hyperparameters and fit it on the
#        training part of the hold-out split (fit() returns the estimator).
SVC1 = LinearSVC().fit(X_train_part, y_train_part)

# 3. Predict on the validation part to estimate model performance.
y_predict = SVC1.predict(X_val)

val_accuracy = accuracy_score(y_val, y_predict)
print('accuracy is :', val_accuracy)

val_report = classification_report(y_val, y_predict)
print("Classification report for classifier %s:\n%s\n" % (SVC1, val_report))

val_confusion = confusion_matrix(y_val, y_predict)
print('Confusion matrix:\n%s' % val_confusion)

打印结果：

accuracy is: 0.7643018745959922
Confusion matrix:
[[ 129 16 3 0 1 41 10 77 93]
[ 3 2762 361 10 12 12 23 14 8]
[ 1 943 535 8 2 2 40 7 8]
[ 0 343 88 86 5 25 15 3 1]
[ 0 17 1 0 519 1 0 3 1]
[ 22 32 5 6 1 2609 46 54 48]
[ 16 71 35 1 3 42 357 41 6]
[ 19 24 7 0 4 42 17 1569 21]
[ 27 20 3 2 2 37 10 55 893]]

使用原始特征+tfidf特征的线性SVM分类性能：accuracy约为0.764。

class_1,class_3和class_4分类效果不好。

是因为这几类样本数目少吗？后面可以采用class_weight='balanced'试一下

线性SVM正则参数调优

线性SVM（LinearSVC）需要调整的正则超参数包括C（正则系数，一般在log域（取log后的值）均匀设置候选参数）和正则函数penalty（L2/L1）。

采用交叉验证，网格搜索步骤与Logistic回归正则参数处理类似，在此略。

这里我们用校验集(X_val,y_val)来估计模型性能

#单组超参数情况：模型在训练集上训练，在校验集上测试并返回性能
def fit_grid_point_Linear(C, X_train, y_train, X_val, y_val):
    """Evaluate one value of C for a linear SVM on a hold-out split.

    Fits a ``LinearSVC(C=C)`` on ``(X_train, y_train)`` and scores it on
    ``(X_val, y_val)``.

    The second parameter used to be misspelled ``X_trian``, which made the
    body fall back to whatever global ``X_train`` existed instead of the
    data passed in. Positional callers are unaffected by the rename.

    Args:
        C: Regularisation strength passed to ``LinearSVC``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        The accuracy reported by ``LinearSVC.score`` on the validation set.
    """
    # Train on the training set that was passed in.
    SVC2 = LinearSVC(C=C)
    SVC2 = SVC2.fit(X_train, y_train)

    # Accuracy on the validation set.
    accuracy = SVC2.score(X_val, y_val)

    print("C={}: accuracy={}".format(C, accuracy))
    return accuracy

# Hyperparameter to tune.
# SVMs are slow to train, so only one hyperparameter is tuned at a time:
# here only C is searched and penalty is left at 'l2'. penalty could be
# searched the same way over ['l1', 'l2'].
# np.logspace(a, b, N) returns N values evenly spaced on a log scale
# between 10**a and 10**b.
C_s = np.logspace(-1, 3, 5)

# Validation accuracy for each candidate C. Arguments follow the signature
# (C, X_train, y_train, X_val, y_val): fit on the training part, score on
# the validation part.
accuracy_s = []
for oneC in C_s:
    accuracy_s.append(
        fit_grid_point_Linear(oneC, X_train_part, y_train_part, X_val, y_val))

# Plot accuracy against log10(C).
X_axis = np.log10(C_s)
plt.plot(X_axis, np.array(accuracy_s), 'b-', label='validation accuracy')

# The curve is labelled above so that legend() has an entry to show.
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')

输出结果为：

最后得到最佳超参数：

# Best hyperparameter: the candidate C whose validation accuracy is highest.
# argmax over the flattened scores gives the position of the first maximum.
index = np.asarray(accuracy_s).argmax()
Best_C = C_s[index]

print(Best_C)

输出结果：

找到最佳参数后，用全体训练数据训练模型

# Retrain a LinearSVC on the full training data with the chosen C.
# NOTE: LinearSVC only offers predict(), not predict_proba(), so this model
# does not output class probabilities by itself.
# NOTE(review): Best_C is pinned to 100 here rather than taken from the
# search -- presumably the value the search printed; confirm.
Best_C = 100

SVC3 = LinearSVC(C=Best_C)
SVC3.fit(X_train, y_train)

# Save the model for later testing.
# cPickle only exists on Python 2; fall back to pickle on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# The with-block guarantees the file is flushed and closed after dumping.
with open("Otto_LinearSVC.pkl", "wb") as model_file:
    pickle.dump(SVC3, model_file)

二.SVM中的RBF核部分

1.读取数据

dpath = './data/'

# The original + tf-idf feature set is still fast for a linear SVM, but the
# RBF kernel is far too slow on it, so only the tf-idf features are loaded.
tfidf_csv = dpath + "Otto_FE_train_tfidf.csv"
train = pd.read_csv(tfidf_csv)

# Show the first rows as a quick sanity check.
preview = train.head()
print(preview)

2.准备数据

# Split the frame into labels and features: keep 'target' as the label vector
# and drop 'id' and 'target' from the feature matrix.
y_train = train['target']
X_train = train.drop(['id', 'target'], axis=1)


# Keep the feature names for later use (visualisation).
feat_names = X_train.columns

# Most sklearn estimators accept sparse input, which makes training much faster.
from scipy.sparse import csr_matrix
X_train = csr_matrix(X_train)


# There are 60k+ training samples, so cross-validation is too slow; estimate
# model performance with a single hold-out split instead.
# (SVMs do not scale well to large sample counts.)
from sklearn.model_selection import train_test_split
X_train_part, X_val, y_train_part, y_val = train_test_split(
    X_train, y_train, train_size=0.8, random_state=0)

print(X_train_part.shape)

3.模型训练

RBF核SVM正则参数调优

RBF核是SVM最常用的核函数。RBF核SVM需要调整的超参数包括正则系数C和核函数的宽度参数gamma：C越小，决策边界越平滑；gamma越小，决策边界越平滑。

采用交叉验证，网格搜索步骤与Logistic回归正则参数处理类似，在此略

这里我们用校验集（X_val,y_val）来估计模型性能

from sklearn.svm import SVC

def fit_grid_point_RBF(C, gamma, X_train, y_train, X_val, y_val):
    """Evaluate one (C, gamma) pair for an RBF-kernel SVM on a hold-out split.

    Fits ``SVC(C=C, kernel='rbf', gamma=gamma)`` on ``(X_train, y_train)``,
    scores it on ``(X_val, y_val)``, prints the result and returns it.

    Args:
        C: Regularisation strength passed to ``SVC``.
        gamma: RBF kernel coefficient passed to ``SVC``.
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        The accuracy reported by ``SVC.score`` on the validation set.
    """
    # fit() returns the fitted estimator, so construction and training chain.
    rbf_model = SVC(C=C, kernel='rbf', gamma=gamma).fit(X_train, y_train)

    # Accuracy on the validation set.
    accuracy = rbf_model.score(X_val, y_val)
    print("C={} and gamma={} :accuracy={}".format(C, gamma, accuracy))
    return accuracy

# Candidate grids: 5 values of C (10**-1 .. 10**3) and 3 values of gamma
# (10**-1 .. 10**1), both evenly spaced on a log scale. This is the 5x3 grid
# the accuracy table is sized for.
C_s = np.logspace(-1, 3, 5)
gamma_s = np.logspace(-1, 1, 3)

# accuracy_s[i, j] holds the validation accuracy for (C_s[i], gamma_s[j]).
# A plain ndarray is used; NumPy discourages np.matrix for new code.
accuracy_s = np.zeros((len(C_s), len(gamma_s)), dtype=float)

# One nested loop replaces the copy-pasted per-C loops, so every C value is
# written to its own row and every loop iterates over gamma_s.
for i, oneC in enumerate(C_s):
    for j, gamma in enumerate(gamma_s):
        accuracy_s[i, j] = fit_grid_point_RBF(oneC, gamma,
                                              X_train_part, y_train_part,
                                              X_val, y_val)

输出结果：

从上述结果会发现，gamma参数非常重要（当gamma=0.1或gamma=10时性能很差），非线性模型比线性模型要好（注意这里只用到了tfidf特征）

但速度慢了不是一点半点（sklearn建议核方法SVM样本数不超过10000）

可以考虑将训练样本分为多个子集，每个子集训练一个RBF核SVM模型，最终融合到一起

# Load previously saved grid-search results. read_csv takes no file-mode
# argument; a second positional value would be interpreted as the separator.
Otto_SVM_result = pd.read_csv("Otto_SVM_result.csv")
accuracy_s1 = Otto_SVM_result['accuracy']

# The grids the saved results correspond to.
C_s = np.logspace(-1, 3, 5)
gamma_s = np.logspace(-1, 1, 3)

# Reshape the flat accuracy column into a (len(C_s), len(gamma_s)) table.
# NOTE(review): assumes the CSV rows are ordered with C as the outer loop and
# gamma as the inner loop -- confirm against how the file was written.
accuracy_s1 = np.array(accuracy_s1).reshape(len(C_s), len(gamma_s))


# One accuracy-vs-log10(C) curve per gamma value.
x_axis = np.log10(C_s)
for j, gamma in enumerate(gamma_s):
    plt.plot(x_axis, accuracy_s1[:, j],
             label='Test-log(gamma)' + str(np.log10(gamma)))

# Decorate and save the figure once, after all curves have been drawn.
plt.legend()
plt.xlabel('log(C)')
plt.ylabel('accuracy')
plt.savefig('RBF_SVM_Otto.png')
plt.show()

最佳超参数：

# Locate the best cell of the accuracy table: argmax over the flattened table
# gives a flat position, and unravel_index converts it back to (row, column).
index = np.unravel_index(np.argmax(accuracy_s1, axis=None), accuracy_s1.shape)

# Rows index C_s and columns index gamma_s.
Best_C = C_s[index[0]]
Best_gamma = gamma_s[index[1]]

print(Best_C)
print(Best_gamma)

输出结果：

找到最佳参数后，用全体训练数据训练模型

# Retrain an RBF-kernel SVC on the full training data with the chosen
# hyperparameters; probability=True enables probability output.
# NOTE(review): the values are pinned here rather than taken from the
# search -- presumably the values the search printed; confirm.
Best_C = 100
Best_gamma = 1.0

SVC4 = SVC(C=Best_C, kernel='rbf', gamma=Best_gamma, probability=True)
SVC4.fit(X_train, y_train)


# Save the model for later testing.
# cPickle only exists on Python 2; fall back to pickle on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# The with-block guarantees the file is flushed and closed after dumping.
with open("Otto_RBF_SVC.pkl", 'wb') as model_file:
    pickle.dump(SVC4, model_file)