支持向量机实例
junjun
2016年2月10日
实例一、对鸢尾花使用SVM进行分类
# 1. Load the data
data(iris)
# 2. Split into training (~80%) and test (~20%) sets.
# NOTE(review): there is no set.seed() call, so this split — and all of
# the hard-coded results recorded below — is not reproducible across runs.
index <- sample(1:2, nrow(iris), prob = c(0.8, 0.2), replace = TRUE)
train_iris <- iris[index == 1, ]
test_iris <- iris[index == 2, ]
# 3. Fit a C-classification SVM.
library(e1071)
# Fixes: "kernal" -> "kernel" (svm() silently ignored the misspelled
# argument via its ... parameter; "radial" is also the default, so the
# recorded results happen to be unaffected) and T/F -> TRUE/FALSE.
model_iris <- svm(Species ~ ., data = train_iris, type = "C-classification",
                  cost = 10, kernel = "radial", gamma = 0.1, scale = FALSE)
# 4. Model evaluation: print the fitted svm object (call, kernel,
# cost/gamma, and number of support vectors).
model_iris
##
## Call:
## svm(formula = Species ~ ., data = train_iris, type = "C-classification",
## cost = 10, kernal = "radial", gamma = 0.1, scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
## gamma: 0.1
##
## Number of Support Vectors: 26
# Training-set accuracy (column 5 of iris is Species).
pred <- predict(model_iris, train_iris)
mean(pred==train_iris[, 5])
## [1] 0.983871
# Confusion matrix: rows = predicted class, columns = actual class.
table(pred, train_iris[, 5])
##
## pred setosa versicolor virginica
## setosa 44 0 0
## versicolor 0 39 0
## virginica 0 2 39
# 5. Prediction on the held-out test set: accuracy and confusion matrix.
pred_iris <- predict(model_iris, test_iris)
mean(pred_iris==test_iris[, 5])
## [1] 1
table(pred_iris, test_iris[, 5])
##
## pred_iris setosa versicolor virginica
## setosa 6 0 0
## versicolor 0 9 0
## virginica 0 0 11
# 6. Refit with a smaller cost to compare results.
# (The original comment mentioned "10-fold CV by default" here, but no
# cross-validation happens in this call — that applies to tune() in
# step 7 below.)
# Fixes: "kernal" -> "kernel" (the misspelled argument was silently
# ignored; radial is the default, so results are unchanged) and
# F -> FALSE.
model_iris1 <- svm(Species ~ ., train_iris, kernel = "radial", cost = 0.1,
                   scale = FALSE)
# Test-set accuracy with the smaller cost.
pred1 <- predict(model_iris1, test_iris)
mean(pred1==test_iris[, 5])
## [1] 0.9615385
table(pred1, test_iris[, 5])
##
## pred1 setosa versicolor virginica
## setosa 6 0 0
## versicolor 0 9 1
## virginica 0 0 10
# 7. Use tune() to search over cost (10-fold cross-validation by default).
# Fix: "kernal" -> "kernel"; the misspelled argument was silently
# forwarded to svm()'s ... and ignored (radial is the default anyway).
model_tune <- tune(svm, Species ~ ., data = train_iris, kernel = "radial",
                   ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)))
# Cross-validation summary: sampling method, best cost, and the
# error/dispersion table for each candidate cost.
summary(model_tune)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 1
##
## - best performance: 0.04871795
##
## - Detailed performance results:
## cost error dispersion
## 1 1e-03 0.73205128 0.11830739
## 2 1e-02 0.73205128 0.11830739
## 3 1e-01 0.11282051 0.12390215
## 4 1e+00 0.04871795 0.05751000
## 5 5e+00 0.05705128 0.05567459
## 6 1e+01 0.05705128 0.06813861
## 7 1e+02 0.05641026 0.06619105
# Inspect the full tune object; $best.model is an svm fit refit on the
# whole training set with the winning cost.
str(model_tune)
## List of 8
## $ best.parameters :'data.frame': 1 obs. of 1 variable:
## ..$ cost: num 1
## ..- attr(*, "out.attrs")=List of 2
## .. ..$ dim : Named int 7
## .. .. ..- attr(*, "names")= chr "cost"
## .. ..$ dimnames:List of 1
## .. .. ..$ cost: chr [1:7] "cost=1e-03" "cost=1e-02" "cost=1e-01" "cost=1e+00" ...
## $ best.performance: num 0.0487
## $ method : chr "svm"
## $ nparcomb : int 7
## $ train.ind :List of 10
## ..$ (0.877,13.3]: int [1:111] 103 113 64 9 26 74 90 20 85 78 ...
## ..$ (13.3,25.6] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (25.6,37.9] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (37.9,50.2] : int [1:111] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (50.2,62.5] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (62.5,74.8] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (74.8,87.1] : int [1:111] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (87.1,99.4] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (99.4,112] : int [1:112] 111 66 91 15 14 63 46 124 119 12 ...
## ..$ (112,124] : int [1:111] 111 66 91 15 14 63 46 124 119 12 ...
## ..- attr(*, "dim")= int 10
## ..- attr(*, "dimnames")=List of 1
## .. ..$ : chr [1:10] "(0.877,13.3]" "(13.3,25.6]" "(25.6,37.9]" "(37.9,50.2]" ...
## $ sampling : chr "10-fold cross validation"
## $ performances :'data.frame': 7 obs. of 3 variables:
## ..$ cost : num [1:7] 1e-03 1e-02 1e-01 1e+00 5e+00 1e+01 1e+02
## ..$ error : num [1:7] 0.7321 0.7321 0.1128 0.0487 0.0571 ...
## ..$ dispersion: num [1:7] 0.1183 0.1183 0.1239 0.0575 0.0557 ...
## $ best.model :List of 30
## ..$ call : language best.tune(method = svm, train.x = Species ~ ., data = train_iris, ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)), kernal = "radial")
## ..$ type : num 0
## ..$ kernel : num 2
## ..$ cost : num 1
## ..$ degree : num 3
## ..$ gamma : num 0.25
## ..$ coef0 : num 0
## ..$ nu : num 0.5
## ..$ epsilon : num 0.1
## ..$ sparse : logi FALSE
## ..$ scaled : logi [1:4] TRUE TRUE TRUE TRUE
## ..$ x.scale :List of 2
## .. ..$ scaled:center: Named num [1:4] 5.8 3.06 3.66 1.15
## .. .. ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. ..$ scaled:scale : Named num [1:4] 0.82 0.437 1.751 0.755
## .. .. ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## ..$ y.scale : NULL
## ..$ nclasses : int 3
## ..$ levels : chr [1:3] "setosa" "versicolor" "virginica"
## ..$ tot.nSV : int 46
## ..$ nSV : int [1:3] 8 19 19
## ..$ labels : int [1:3] 1 2 3
## ..$ SV : num [1:46, 1:4] -1.709 -0.124 -0.49 -0.855 -0.977 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:46] "9" "16" "21" "24" ...
## .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## ..$ index : int [1:46] 9 15 19 21 23 29 37 38 45 46 ...
## ..$ rho : num [1:3] -0.0606 0.0842 0.0634
## ..$ compprob : logi FALSE
## ..$ probA : NULL
## ..$ probB : NULL
## ..$ sigma : NULL
## ..$ coefs : num [1:46, 1:2] 0.0895 0.8159 0 0.6514 0.6058 ...
## ..$ na.action : NULL
## ..$ fitted : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
## .. ..- attr(*, "names")= chr [1:124] "1" "2" "3" "4" ...
## ..$ decision.values: num [1:124, 1:3] 1.19 1.06 1.17 1.1 1.17 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:124] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:3] "setosa/versicolor" "setosa/virginica" "versicolor/virginica"
## ..$ terms :Classes 'terms', 'formula' length 3 Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
## .. .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## .. .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
## .. .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. .. ..- attr(*, "intercept")= num 0
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
## .. .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
## ..- attr(*, "class")= chr [1:2] "svm.formula" "svm"
## - attr(*, "class")= chr "tune"
# Extract the best model found by tune() (per the CV output above, the
# winning parameter was cost = 1).
model_best <- model_tune$best.model
summary(model_best)
##
## Call:
## best.tune(method = svm, train.x = Species ~ ., data = train_iris,
## ranges = list(cost = c(0.001, 0.01, 0.1, 1, 5, 10, 100)),
## kernal = "radial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
## gamma: 0.25
##
## Number of Support Vectors: 46
##
## ( 8 19 19 )
##
##
## Number of Classes: 3
##
## Levels:
## setosa versicolor virginica
# Prediction on the test set with the tuned best model.
pred_tune <- predict(model_best, test_iris)
mean(pred_tune==test_iris[, 5])
## [1] 1
table(pred_tune, test_iris[, 5])
##
## pred_tune setosa versicolor virginica
## setosa 6 0 0
## versicolor 0 9 0
## virginica 0 0 11
实例二:寻找最优参数
使用支持向量机实现二元分类器,使用的数据是来自MASS包的cats数据集。在本例中你将尝试使用体重和心脏重量来预测一只猫的性别。
# 1. Load the cats data set from MASS: 144 rows with Sex (factor F/M)
# plus two numeric measurements, Bwt and Hwt.
data(cats, package="MASS")
str(cats)
## 'data.frame': 144 obs. of 3 variables:
## $ Sex: Factor w/ 2 levels "F","M": 1 1 1 1 1 1 1 1 1 1 ...
## $ Bwt: num 2 2 2 2.1 2.1 2.1 2.1 2.1 2.1 2.1 ...
## $ Hwt: num 7 7.4 9.5 7.2 7.3 7.6 8.1 8.2 8.3 8.5 ...
# Column summaries; note the class imbalance (F: 47 vs M: 97).
summary(cats)
## Sex Bwt Hwt
## F:47 Min. :2.000 Min. : 6.30
## M:97 1st Qu.:2.300 1st Qu.: 8.95
## Median :2.700 Median :10.10
## Mean :2.724 Mean :10.63
## 3rd Qu.:3.025 3rd Qu.:12.12
## Max. :3.900 Max. :20.50
# 2. Split cats into training (~70%) and test (~30%) sets.
# NOTE(review): no set.seed() here either, so the split and the recorded
# results below are not reproducible.
# Fix: T -> TRUE.
index <- sample(1:2, nrow(cats), prob = c(0.7, 0.3), replace = TRUE)
train_cats <- cats[index == 1, ]
test_cats <- cats[index == 2, ]
# 3. Model fitting
library(e1071)
# 1) Linear-kernel SVM.
# Fix: the original wrote kernal = "linear"; svm() silently ignored the
# misspelled argument and fell back to its default RADIAL kernel — the
# recorded output below this chunk ("SVM-Kernel: radial") confirms it.
# With the spelling corrected, a genuinely linear SVM is fit, so the
# recorded numbers may no longer match exactly.
model_linear <- svm(Sex ~ ., train_cats, kernel = "linear", cost = 10,
                    scale = FALSE)
# Prediction (also dropped the stray empty subset "test_cats[]", which
# was a no-op).
pred <- predict(model_linear, test_cats)
mean(pred == test_cats$Sex)
## [1] 0.7560976
table(pred, test_cats$Sex)
##
## pred F M
## F 7 6
## M 4 24
# Print the fitted model. Note the recorded output shows
# "SVM-Kernel: radial" even though "linear" was requested — evidence
# that the misspelled kernal= argument was silently ignored.
print(model_linear)
##
## Call:
## svm(formula = Sex ~ ., data = train_cats, kernal = "linear",
## cost = 10, scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
## gamma: 0.5
##
## Number of Support Vectors: 55
# 2) Radial-kernel SVM.
# Fixes: "kernal" -> "kernel" (radial is the default, so the recorded
# results are unaffected) and F -> FALSE.
model_radial <- svm(Sex ~ ., train_cats, kernel = "radial", cost = 10,
                    scale = FALSE)
# Test-set accuracy and confusion matrix.
pred <- predict(model_radial, test_cats)
mean(pred == test_cats$Sex)
## [1] 0.7560976
table(pred, test_cats$Sex)
##
## pred F M
## F 7 6
## M 4 24
# Print the fitted radial model (kernel, cost, gamma, support vectors).
print(model_radial)
##
## Call:
## svm(formula = Sex ~ ., data = train_cats, kernal = "radial",
## cost = 10, scale = F)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 10
## gamma: 0.5
##
## Number of Support Vectors: 55
# 4. Search for the best parameters with tune.svm(), a convenience
# wrapper around tune() that grid-searches gamma and cost
# (10-fold cross-validation by default).
model_tuned <- tune.svm(Sex~., data=train_cats, gamma=10^(-6:-1),cost=10^(1:2))
summary(model_tuned)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## gamma cost
## 0.001 100
##
## - best performance: 0.2172727
##
## - Detailed performance results:
## gamma cost error dispersion
## 1 1e-06 10 0.3463636 0.2105330
## 2 1e-05 10 0.3463636 0.2105330
## 3 1e-04 10 0.3463636 0.2105330
## 4 1e-03 10 0.3681818 0.2027474
## 5 1e-02 10 0.2272727 0.1177537
## 6 1e-01 10 0.2363636 0.1363973
## 7 1e-06 100 0.3463636 0.2105330
## 8 1e-05 100 0.3463636 0.2105330
## 9 1e-04 100 0.3681818 0.2027474
## 10 1e-03 100 0.2172727 0.1243865
## 11 1e-02 100 0.2272727 0.1432920
## 12 1e-01 100 0.2354545 0.1426143
# NOTE(review): the CV table above reports the best parameters as
# gamma = 0.001, cost = 100 — NOT cost = 10, gamma = 0.1 as the original
# comment claimed and the original call hard-coded.
# 5. Refit the radial SVM with the tuned optimal parameters; read them
# from the tune result instead of hard-coding mismatched values.
# Fixes: "kernal" -> "kernel" (silently ignored when misspelled) and
# F -> FALSE.
best_par <- model_tuned$best.parameters
model_cats <- svm(Sex ~ ., train_cats, kernel = "radial",
                  cost = best_par$cost, gamma = best_par$gamma,
                  scale = FALSE)
# 6. Test-set accuracy and confusion matrix.
# (The recorded output below came from the original cost = 10,
# gamma = 0.1 fit and may not match the properly tuned model.)
pred <- predict(model_cats, test_cats)
mean(pred == test_cats$Sex)
## [1] 0.7560976
table(pred, test_cats$Sex)
##
## pred F M
## F 8 7
## M 3 23
实例三:kernlab包中的ksvm()函数实现支持向量机
# 1. Load the data
data(iris)
# 2. Split into training (~70%) and test (~30%) sets.
# NOTE(review): no set.seed(), so the split is not reproducible.
index <- sample(1:2, nrow(iris), prob = c(0.7, 0.3), replace = TRUE)
train_iris <- iris[index == 1, ]
test_iris <- iris[index == 2, ]
# 3. Fit with kernlab::ksvm.
library(kernlab)
# Fixes: "kernal" -> "kernel" (ksvm's argument name; the misspelling was
# silently ignored, and the default rbfdot kernel happened to be what was
# intended, so the recorded results are unchanged) and T -> TRUE.
model <- ksvm(Species ~ ., train_iris, kernel = "rbfdot", type = "C-bsvc",
              kpar = list(sigma = 0.1), C = 10, prob.model = TRUE)
# 4. Test-set accuracy and confusion matrix.
pred <- predict(model, test_iris)
mean(pred == test_iris[, 5])
## [1] 0.9642857
table(pred, test_iris[, 5])
##
## pred setosa versicolor virginica
## setosa 18 0 0
## versicolor 0 19 0
## virginica 0 2 17