决策树
junjun
2016年2月8日
#MarkDown脚本及数据集下载:http://pan.baidu.com/s/1dEx4vgX
实例一、rpart包对iris数据进行分类
#1、加载数据并查看
data("iris")
str(iris)
## 'data.frame': 150 obs. of 5 variables:
## $ Sepal.Length: num 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
## $ Sepal.Width : num 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
## $ Petal.Length: num 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
## $ Petal.Width : num 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
## $ Species : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## Min. :4.300 Min. :2.000 Min. :1.000 Min. :0.100
## 1st Qu.:5.100 1st Qu.:2.800 1st Qu.:1.600 1st Qu.:0.300
## Median :5.800 Median :3.000 Median :4.350 Median :1.300
## Mean :5.843 Mean :3.057 Mean :3.758 Mean :1.199
## 3rd Qu.:6.400 3rd Qu.:3.300 3rd Qu.:5.100 3rd Qu.:1.800
## Max. :7.900 Max. :4.400 Max. :6.900 Max. :2.500
## Species
## setosa :50
## versicolor:50
## virginica :50
##
##
##
#总结:从上可知,共计150条记录,5个变量,其中,4个自变量为数字类型,因变量为因子类型;无缺失值
#2、创建训练集和测试集数据
set.seed(123)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
index <- createDataPartition(iris$Species, p=0.8, list=F)
train_iris <- iris[index, ]
test_iris <- iris[-index, ]
#3、建模
library(rpart)
model_iris <- rpart(Species~., data=train_iris, control = rpart.control(minsplit = 10))
#4、模型评估
str(model_iris)
## List of 14
## $ frame :'data.frame': 5 obs. of 9 variables:
## ..$ var : Factor w/ 2 levels "<leaf>","Petal.Length": 2 1 2 1 1
## ..$ n : int [1:5] 120 40 80 39 41
## ..$ wt : num [1:5] 120 40 80 39 41
## ..$ dev : num [1:5] 80 0 40 1 2
## ..$ yval : num [1:5] 1 1 2 2 3
## ..$ complexity: num [1:5] 0.5 0.01 0.463 0 0
## ..$ ncompete : int [1:5] 3 0 3 0 0
## ..$ nsurrogate: int [1:5] 3 0 3 0 0
## ..$ yval2 : num [1:5, 1:8] 1 1 2 2 3 40 40 0 0 0 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : NULL
## .. .. ..$ : chr [1:8] "" "" "" "" ...
## $ where : Named int [1:120] 2 2 2 2 2 2 2 2 2 2 ...
## ..- attr(*, "names")= chr [1:120] "1" "2" "3" "4" ...
## $ call : language rpart(formula = Species ~ ., data = train_iris, control = rpart.control(minsplit = 10))
## $ terms :Classes 'terms', 'formula' length 3 Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
## .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
## .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## .. ..- attr(*, "order")= int [1:4] 1 1 1 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
## .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
## .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
## $ cptable : num [1:3, 1:5] 0.5 0.463 0.01 0 1 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:3] "1" "2" "3"
## .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
## $ method : chr "class"
## $ parms :List of 3
## ..$ prior: num [1:3(1d)] 0.333 0.333 0.333
## .. ..- attr(*, "dimnames")=List of 1
## .. .. ..$ : chr [1:3] "1" "2" "3"
## ..$ loss : num [1:3, 1:3] 0 1 1 1 0 1 1 1 0
## ..$ split: num 1
## $ control :List of 9
## ..$ minsplit : num 10
## ..$ minbucket : num 3
## ..$ cp : num 0.01
## ..$ maxcompete : int 4
## ..$ maxsurrogate : int 5
## ..$ usesurrogate : int 2
## ..$ surrogatestyle: int 0
## ..$ maxdepth : int 30
## ..$ xval : int 10
## $ functions :List of 3
## ..$ summary:function (yval, dev, wt, ylevel, digits)
## ..$ print :function (yval, ylevel, digits)
## ..$ text :function (yval, dev, wt, ylevel, digits, n, use.n)
## $ numresp : int 5
## $ splits : num [1:14, 1:5] 120 120 120 120 0 0 0 80 80 80 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:14] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width" ...
## .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
## $ variable.importance: Named num [1:4] 74.2 69.9 44.8 30.8
## ..- attr(*, "names")= chr [1:4] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width"
## $ y : int [1:120] 1 1 1 1 1 1 1 1 1 1 ...
## $ ordered : Named logi [1:4] FALSE FALSE FALSE FALSE
## ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
## - attr(*, "xlevels")= Named list()
## - attr(*, "ylevels")= chr [1:3] "setosa" "versicolor" "virginica"
## - attr(*, "class")= chr "rpart"
summary(model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, control = rpart.control(minsplit = 10))
## n= 120
##
## CP nsplit rel error xerror xstd
## 1 0.5000 0 1.0000 1.1625 0.05717982
## 2 0.4625 1 0.5000 0.7500 0.06846532
## 3 0.0100 2 0.0375 0.0500 0.02457980
##
## Variable importance
## Petal.Length Petal.Width Sepal.Length Sepal.Width
## 34 32 20 14
##
## Node number 1: 120 observations, complexity param=0.5
## predicted class=setosa expected loss=0.6666667 P(node) =1
## class counts: 40 40 40
## probabilities: 0.333 0.333 0.333
## left son=2 (40 obs) right son=3 (80 obs)
## Primary splits:
## Petal.Length < 2.45 to the left, improve=40.00000, (0 missing)
## Petal.Width < 0.8 to the left, improve=40.00000, (0 missing)
## Sepal.Length < 5.45 to the left, improve=25.97403, (0 missing)
## Sepal.Width < 3.35 to the right, improve=17.24432, (0 missing)
## Surrogate splits:
## Petal.Width < 0.8 to the left, agree=1.000, adj=1.000, (0 split)
## Sepal.Length < 5.45 to the left, agree=0.908, adj=0.725, (0 split)
## Sepal.Width < 3.35 to the right, agree=0.850, adj=0.550, (0 split)
##
## Node number 2: 40 observations
## predicted class=setosa expected loss=0 P(node) =0.3333333
## class counts: 40 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 3: 80 observations, complexity param=0.4625
## predicted class=versicolor expected loss=0.5 P(node) =0.6666667
## class counts: 0 40 40
## probabilities: 0.000 0.500 0.500
## left son=6 (39 obs) right son=7 (41 obs)
## Primary splits:
## Petal.Length < 4.75 to the left, improve=34.246400, (0 missing)
## Petal.Width < 1.65 to the left, improve=30.798240, (0 missing)
## Sepal.Length < 6.15 to the left, improve= 8.120301, (0 missing)
## Sepal.Width < 2.45 to the left, improve= 3.657143, (0 missing)
## Surrogate splits:
## Petal.Width < 1.75 to the left, agree=0.938, adj=0.872, (0 split)
## Sepal.Length < 6.15 to the left, agree=0.738, adj=0.462, (0 split)
## Sepal.Width < 2.95 to the left, agree=0.638, adj=0.256, (0 split)
##
## Node number 6: 39 observations
## predicted class=versicolor expected loss=0.02564103 P(node) =0.325
## class counts: 0 38 1
## probabilities: 0.000 0.974 0.026
##
## Node number 7: 41 observations
## predicted class=virginica expected loss=0.04878049 P(node) =0.3416667
## class counts: 0 2 39
## probabilities: 0.000 0.049 0.951
#5、模型优化:选择具有最小误差的决策树
opt <- which.min(model_iris$cptable[, "xerror"])
cp <- model_iris$cptable[opt, "CP"]
model_cp <- prune(model_iris, cp=cp)
print(model_cp)
## n= 120
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 120 80 setosa (0.33333333 0.33333333 0.33333333)
## 2) Petal.Length< 2.45 40 0 setosa (1.00000000 0.00000000 0.00000000) *
## 3) Petal.Length>=2.45 80 40 versicolor (0.00000000 0.50000000 0.50000000)
## 6) Petal.Length< 4.75 39 1 versicolor (0.00000000 0.97435897 0.02564103) *
## 7) Petal.Length>=4.75 41 2 virginica (0.00000000 0.04878049 0.95121951) *
#6、预测
pred <- predict(model_cp, test_iris, type="class")
plot(model_cp)
text(model_cp)
#7、查看预测错误的比率
mean(test_iris[, 5]!=pred)
## [1] 0.1333333
table(pred, test_iris[, 5])
##
## pred setosa versicolor virginica
## setosa 10 0 0
## versicolor 0 6 0
## virginica 0 4 10
library(gmodels)
CrossTable(pred, test_iris[, 5])
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 30
##
##
## | test_iris[, 5]
## pred | setosa | versicolor | virginica | Row Total |
## -------------|------------|------------|------------|------------|
## setosa | 10 | 0 | 0 | 10 |
## | 13.333 | 3.333 | 3.333 | |
## | 1.000 | 0.000 | 0.000 | 0.333 |
## | 1.000 | 0.000 | 0.000 | |
## | 0.333 | 0.000 | 0.000 | |
## -------------|------------|------------|------------|------------|
## versicolor | 0 | 6 | 0 | 6 |
## | 2.000 | 8.000 | 2.000 | |
## | 0.000 | 1.000 | 0.000 | 0.200 |
## | 0.000 | 0.600 | 0.000 | |
## | 0.000 | 0.200 | 0.000 | |
## -------------|------------|------------|------------|------------|
## virginica | 0 | 4 | 10 | 14 |
## | 4.667 | 0.095 | 6.095 | |
## | 0.000 | 0.286 | 0.714 | 0.467 |
## | 0.000 | 0.400 | 1.000 | |
## | 0.000 | 0.133 | 0.333 | |
## -------------|------------|------------|------------|------------|
## Column Total | 10 | 10 | 10 | 30 |
## | 0.333 | 0.333 | 0.333 | |
## -------------|------------|------------|------------|------------|
##
##
实例二、C50包中的C5.0()函数对德国信用数据分类
#1、加载数据并查看
credit <- read.csv("F:\\R\\Rworkspace\\机器学习实践课\\德国信用数据/german.data", header=F, sep=" ", stringsAsFactors=F)
str(credit)
## 'data.frame': 1000 obs. of 21 variables:
## $ V1 : chr "A11" "A12" "A14" "A11" ...
## $ V2 : int 6 48 12 42 24 36 24 36 12 30 ...
## $ V3 : chr "A34" "A32" "A34" "A32" ...
## $ V4 : chr "A43" "A43" "A46" "A42" ...
## $ V5 : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ V6 : chr "A65" "A61" "A61" "A61" ...
## $ V7 : chr "A75" "A73" "A74" "A74" ...
## $ V8 : int 4 2 2 2 3 2 3 2 2 4 ...
## $ V9 : chr "A93" "A92" "A93" "A93" ...
## $ V10: chr "A101" "A101" "A101" "A103" ...
## $ V11: int 4 2 3 4 4 4 4 2 4 2 ...
## $ V12: chr "A121" "A121" "A121" "A122" ...
## $ V13: int 67 22 49 45 53 35 53 35 61 28 ...
## $ V14: chr "A143" "A143" "A143" "A143" ...
## $ V15: chr "A152" "A152" "A152" "A153" ...
## $ V16: int 2 1 1 1 2 1 1 1 1 2 ...
## $ V17: chr "A173" "A173" "A172" "A173" ...
## $ V18: int 1 1 2 2 2 2 1 1 1 1 ...
## $ V19: chr "A192" "A191" "A191" "A191" ...
## $ V20: chr "A201" "A201" "A201" "A201" ...
## $ V21: int 1 2 1 1 2 1 1 1 1 2 ...
summary(credit)
## V1 V2 V3 V4
## Length:1000 Min. : 4.0 Length:1000 Length:1000
## Class :character 1st Qu.:12.0 Class :character Class :character
## Mode :character Median :18.0 Mode :character Mode :character
## Mean :20.9
## 3rd Qu.:24.0
## Max. :72.0
## V5 V6 V7 V8
## Min. : 250 Length:1000 Length:1000 Min. :1.000
## 1st Qu.: 1366 Class :character Class :character 1st Qu.:2.000
## Median : 2320 Mode :character Mode :character Median :3.000
## Mean : 3271 Mean :2.973
## 3rd Qu.: 3972 3rd Qu.:4.000
## Max. :18424 Max. :4.000
## V9 V10 V11 V12
## Length:1000 Length:1000 Min. :1.000 Length:1000
## Class :character Class :character 1st Qu.:2.000 Class :character
## Mode :character Mode :character Median :3.000 Mode :character
## Mean :2.845
## 3rd Qu.:4.000
## Max. :4.000
## V13 V14 V15 V16
## Min. :19.00 Length:1000 Length:1000 Min. :1.000
## 1st Qu.:27.00 Class :character Class :character 1st Qu.:1.000
## Median :33.00 Mode :character Mode :character Median :1.000
## Mean :35.55 Mean :1.407
## 3rd Qu.:42.00 3rd Qu.:2.000
## Max. :75.00 Max. :4.000
## V17 V18 V19 V20
## Length:1000 Min. :1.000 Length:1000 Length:1000
## Class :character 1st Qu.:1.000 Class :character Class :character
## Mode :character Median :1.000 Mode :character Mode :character
## Mean :1.155
## 3rd Qu.:1.000
## Max. :2.000
## V21
## Min. :1.0
## 1st Qu.:1.0
## Median :1.0
## Mean :1.3
## 3rd Qu.:2.0
## Max. :2.0
#从上可知:共计1000条记录21个变量,有字符类型也有数字类型;每列的属性名为默认名称;
#2、数据预处理
#1)对数据的列进行命名
colnames(credit) <- c("Status.of.existing.checking.account", "Duration.in.month",
"Credit.history", "Purpose", "Credit.amount", "Savings.account.bonds",
"Present.employment.since",
"Installment.rate.in.percentage.of.disposable.income",
"Personal.status.and.sex", "Other.debtors.guarantors",
"Present.residence.since","Property", "Age.in.years",
"Other.installment.plans", "Housing",
"Number.of.existing.credits.at.this.bank", "Job",
"Number.of.people.being.liable.to.provide.maintenance.for",
"Telephone", "foreign.worker", "Good.Loan")
#2)对每列的数据进行映射:
#自定义映射
mapping <- list('All'='... < 0 DM',
'A12'='0 <= ... < 200 DM',
'A13'='... >=200 DM / salary assignment for at least 1 year',
'A14'='no checking accout',
'A30'='no credits taken/all credits paid back duly',
'A31'='all credits at this bank paid back duly',
'A32'='existing credits paid back duly till now',
'A33'='delay in paying off in the past',
'A34'='critical account/other credits existing (not at this bank)',
'A40'='car (new)',
'A41'='car (used)',
'A42'='furniture/equipment',
'A43'='radio/television',
'A44'='domestic appliances',
'A45'='repairs',
'A46'='education',
'A47'='(vacation - does not exist?)',
'A48'='retraining',
'A49'='business',
'A410'='other',
'A61'='... < 100 DM',
'A62'='100 <= ... < 500 DM',
'A63'='500 <= ... < 1000 DM',
'A64'='.. >= 1000 DM',
'A65'='unknown/ no savings account',
'A71'='unemployed',
'A72'='... < 1 year',
'A73'='1 <= ... < 4 years',
'A74'='4 <= ... < 7 years',
'A75'='.. >= 7 years',
'A91'='male : divorced/separated',
'A92'='female : divorced/separated/married',
'A93'='male : single',
'A94'='male : married/widowed',
'A95'='female : single',
'A101'='none',
'A102'='co-applicant',
'A103'='auarantor',
'A121'='real estate',
'A122'='if not A121 : building society savings agreement/life insurance',
'A123'='if not A121/A122 : car or other, not in attribute 6',
'A124'='unknown / no property',
'A141'='bank',
'A142'='stores',
'A143'='none',
'A151'='rent',
'A152'='own',
'A153'='for free',
'A171'='unemployed/ unskilled - non-resident',
'A172'='unskilled - resident',
'A173'='skilled employee / official',
'A174'='management/ self=employed/hightly qualified employee/ officer',
'A191'='none',
'A192'='yes, registered under the customers name',
'A201'='yes',
'A202'='no')
#对每列的数据进行映射
for(i in 1:ncol(credit)) {
if(class(credit[, i])=="character"){
credit[, i] = as.factor(as.character(mapping[credit[, i]]))
}
}
#3)对因变量进行因子转换
credit$Good.Loan <- as.factor(ifelse(credit$Good.Loan==1, "GoodLoan", "BadLoan"))
#查看清洗后的数据:查看各列的频数
table(credit$Status.of.existing.checking.account)
##
## ... >=200 DM / salary assignment for at least 1 year
## 63
## 0 <= ... < 200 DM
## 269
## no checking accout
## 394
## NULL
## 274
table(credit$Savings.account.bonds)
##
## .. >= 1000 DM ... < 100 DM
## 48 603
## 100 <= ... < 500 DM 500 <= ... < 1000 DM
## 103 63
## unknown/ no savings account
## 183
summary(credit$Duration.in.month)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 12.0 18.0 20.9 24.0 72.0
#3、构建测试集和训练集数据
library(caret)
set.seed(2000)
index <- createDataPartition(credit$Good.Loan, p=0.8, list=F)
train_credit <- credit[index, ]
test_credit <- credit[-index, ]
prop.table(table(train_credit$Good.Loan))
##
## BadLoan GoodLoan
## 0.3 0.7
prop.table(table(test_credit$Good.Loan))
##
## BadLoan GoodLoan
## 0.3 0.7
#从上可知:测试集和训练集数据的比率相同
#4、建模
library(C50)
model_credit <- C5.0(train_credit[, -21], train_credit[, 21])
#5、模型评估
model_credit
##
## Call:
## C5.0.default(x = train_credit[, -21], y = train_credit[, 21])
##
## Classification Tree
## Number of samples: 800
## Number of predictors: 20
##
## Tree size: 60
##
## Non-standard options: attempt to group attributes
summary(model_credit)
##
## Call:
## C5.0.default(x = train_credit[, -21], y = train_credit[, 21])
##
##
## C5.0 [Release 2.07 GPL Edition] Mon Feb 08 12:07:02 2016
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 800 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## Status.of.existing.checking.account in {... >=200 DM / salary assignment for at least 1 year,
## : no checking accout}: GoodLoan (368/49)
## Status.of.existing.checking.account in {0 <= ... < 200 DM,NULL}:
## :...Duration.in.month <= 22: [S1]
## Duration.in.month > 22:
## :...Present.residence.since <= 1:
## :...Housing = for free: GoodLoan (3)
## : Housing = rent: BadLoan (1)
## : Housing = own:
## : :...Status.of.existing.checking.account = NULL:
## : :...Credit.amount <= 1943: GoodLoan (2)
## : : Credit.amount > 1943: BadLoan (6/1)
## : Status.of.existing.checking.account = 0 <= ... < 200 DM:
## : :...Telephone = none: GoodLoan (9)
## : Telephone = yes, registered under the customers name:
## : :...Credit.amount <= 3878: BadLoan (2)
## : Credit.amount > 3878: GoodLoan (2)
## Present.residence.since > 1:
## :...Savings.account.bonds = .. >= 1000 DM:
## :...Other.debtors.guarantors = co-applicant: BadLoan (1)
## : Other.debtors.guarantors in {auarantor,none}: GoodLoan (4)
## Savings.account.bonds = 500 <= ... < 1000 DM:
## :...Other.installment.plans in {bank,none}: BadLoan (4)
## : Other.installment.plans = stores: GoodLoan (2)
## Savings.account.bonds = unknown/ no savings account: [S2]
## Savings.account.bonds = 100 <= ... < 500 DM:
## :...Other.debtors.guarantors in {auarantor,
## : : co-applicant}: BadLoan (2)
## : Other.debtors.guarantors = none:
## : :...Duration.in.month > 54: BadLoan (3)
## : Duration.in.month <= 54: [S3]
## Savings.account.bonds = ... < 100 DM:
## :...Other.debtors.guarantors = auarantor: [S4]
## Other.debtors.guarantors in {co-applicant,none}:
## :...Duration.in.month > 45: BadLoan (23/1)
## Duration.in.month <= 45:
## :...Purpose in {car (new),domestic appliances,education,
## : other,repairs,
## : retraining}: BadLoan (20/3)
## Purpose = business: [S5]
## Purpose = car (used):
## :...Age.in.years <= 28: BadLoan (4)
## : Age.in.years > 28: GoodLoan (11/1)
## Purpose = furniture/equipment: [S6]
## Purpose = radio/television: [S7]
##
## SubTree [S1]
##
## Credit.history = delay in paying off in the past: GoodLoan (13/1)
## Credit.history in {all credits at this bank paid back duly,
## : no credits taken/all credits paid back duly}: BadLoan (21/6)
## Credit.history = critical account/other credits existing (not at this bank):
## :...Other.installment.plans = stores: GoodLoan (1)
## : Other.installment.plans = bank: [S8]
## : Other.installment.plans = none:
## : :...Other.debtors.guarantors = auarantor: GoodLoan (3)
## : Other.debtors.guarantors = co-applicant: BadLoan (3/1)
## : Other.debtors.guarantors = none:
## : :...Purpose in {business,car (new),car (used),domestic appliances,
## : : furniture/equipment,other,radio/television,
## : : retraining}: GoodLoan (44/3)
## : Purpose in {education,repairs}: BadLoan (3)
## Credit.history = existing credits paid back duly till now:
## :...Savings.account.bonds = .. >= 1000 DM: GoodLoan (8)
## Savings.account.bonds in {... < 100 DM,100 <= ... < 500 DM,
## : 500 <= ... < 1000 DM,unknown/ no savings account}: [S9]
##
## SubTree [S2]
##
## Status.of.existing.checking.account = 0 <= ... < 200 DM: GoodLoan (13/1)
## Status.of.existing.checking.account = NULL: BadLoan (13/3)
##
## SubTree [S3]
##
## Telephone = yes, registered under the customers name: GoodLoan (6)
## Telephone = none: [S10]
##
## SubTree [S4]
##
## Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (2)
## Status.of.existing.checking.account = NULL: GoodLoan (6)
##
## SubTree [S5]
##
## Property in {if not A121 . building society savings agreement/life insurance,
## : real estate,unknown / no property}: BadLoan (6)
## Property = if not A121/A122 . car or other, not in attribute 6: GoodLoan (2)
##
## SubTree [S6]
##
## Present.employment.since in {.. >= 7 years,... < 1 year,
## : 4 <= ... < 7 years}: BadLoan (14)
## Present.employment.since in {1 <= ... < 4 years,unemployed}:
## :...Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (1)
## Status.of.existing.checking.account = NULL: GoodLoan (3)
##
## SubTree [S7]
##
## Status.of.existing.checking.account = 0 <= ... < 200 DM: GoodLoan (4)
## Status.of.existing.checking.account = NULL:
## :...Present.employment.since = .. >= 7 years: GoodLoan (2)
## Present.employment.since in {... < 1 year,4 <= ... < 7 years,
## : unemployed}: BadLoan (6)
## Present.employment.since = 1 <= ... < 4 years:
## :...Credit.amount <= 2835: BadLoan (2)
## Credit.amount > 2835: GoodLoan (2)
##
## SubTree [S8]
##
## Job in {management/ self=employed/hightly qualified employee/ officer,
## : unemployed/ unskilled - non-resident}: BadLoan (0)
## Job = unskilled - resident: GoodLoan (3)
## Job = skilled employee / official:
## :...Purpose = business: GoodLoan (1)
## Purpose in {car (new),car (used),domestic appliances,education,
## furniture/equipment,other,radio/television,repairs,
## retraining}: BadLoan (5)
##
## SubTree [S9]
##
## Property = real estate: GoodLoan (57/14)
## Property = unknown / no property:
## :...Age.in.years <= 56: BadLoan (5)
## : Age.in.years > 56: GoodLoan (3/1)
## Property = if not A121 . building society savings agreement/life insurance:
## :...Number.of.existing.credits.at.this.bank > 1: GoodLoan (2)
## : Number.of.existing.credits.at.this.bank <= 1: [S11]
## Property = if not A121/A122 . car or other, not in attribute 6:
## :...Credit.amount <= 1386:
## :...Present.employment.since = .. >= 7 years: GoodLoan (2)
## : Present.employment.since in {... < 1 year,1 <= ... < 4 years,
## : 4 <= ... < 7 years,
## : unemployed}: BadLoan (13)
## Credit.amount > 1386:
## :...Other.debtors.guarantors = auarantor: GoodLoan (0)
## Other.debtors.guarantors = co-applicant: BadLoan (1)
## Other.debtors.guarantors = none:
## :...Age.in.years > 22: GoodLoan (21/3)
## Age.in.years <= 22: [S12]
##
## SubTree [S10]
##
## Job in {management/ self=employed/hightly qualified employee/ officer,
## : unskilled - resident}: BadLoan (3)
## Job in {skilled employee / official,
## unemployed/ unskilled - non-resident}: GoodLoan (8/2)
##
## SubTree [S11]
##
## Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (9/2)
## Status.of.existing.checking.account = NULL: GoodLoan (18/7)
##
## SubTree [S12]
##
## Installment.rate.in.percentage.of.disposable.income <= 1: GoodLoan (2)
## Installment.rate.in.percentage.of.disposable.income > 1: BadLoan (2)
##
##
## Evaluation on training data (800 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 58 99(12.4%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 158 82 (a): class BadLoan
## 17 543 (b): class GoodLoan
##
##
## Attribute usage:
##
## 100.00% Status.of.existing.checking.account
## 54.00% Duration.in.month
## 38.75% Savings.account.bonds
## 30.00% Credit.history
## 26.75% Other.debtors.guarantors
## 24.00% Present.residence.since
## 17.88% Property
## 16.25% Purpose
## 8.63% Other.installment.plans
## 7.13% Credit.amount
## 6.00% Age.in.years
## 5.63% Present.employment.since
## 3.75% Telephone
## 3.63% Number.of.existing.credits.at.this.bank
## 3.13% Housing
## 2.50% Job
## 0.50% Installment.rate.in.percentage.of.disposable.income
##
##
## Time: 0.0 secs
pred_credit <- predict(model_credit, train_credit)
mean(pred_credit==train_credit[, 21])
## [1] 0.87625
#6、预测
pred <- predict(model_credit, test_credit)
#7、查看预测数据
mean(pred==test_credit[, 21])
## [1] 0.745
table(pred, test_credit[, 21])
##
## pred BadLoan GoodLoan
## BadLoan 26 17
## GoodLoan 34 123
library(gmodels)
CrossTable(pred, test_credit[, 21])
##
##
## Cell Contents
## |-------------------------|
## | N |
## | Chi-square contribution |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 200
##
##
## | test_credit[, 21]
## pred | BadLoan | GoodLoan | Row Total |
## -------------|-----------|-----------|-----------|
## BadLoan | 26 | 17 | 43 |
## | 13.303 | 5.701 | |
## | 0.605 | 0.395 | 0.215 |
## | 0.433 | 0.121 | |
## | 0.130 | 0.085 | |
## -------------|-----------|-----------|-----------|
## GoodLoan | 34 | 123 | 157 |
## | 3.644 | 1.562 | |
## | 0.217 | 0.783 | 0.785 |
## | 0.567 | 0.879 | |
## | 0.170 | 0.615 | |
## -------------|-----------|-----------|-----------|
## Column Total | 60 | 140 | 200 |
## | 0.300 | 0.700 | |
## -------------|-----------|-----------|-----------|
##
##
#从上面列联表可知:错误率为(34+17)/200=25.05%,同时有34/60=57%d bad loan我们归类为good loan。这结果显然不满足我们的要求。
#8、继续模型优化:C5.0中加入了adaptive boosting的支持
#1)使用迭代
model_boot <- C5.0(train_credit[, -21], train_credit[, 21], trials = 10)
pred_boot <- predict(model_boot, test_credit[, -21])
table(pred_boot, test_credit[, 21])
##
## pred_boot BadLoan GoodLoan
## BadLoan 28 19
## GoodLoan 32 121
#从上可知:加入了boost后错误率没变,badloan误判为goodloan的概率为32/60=51%。然而我们知道如果把badloan 归类为goodloan的损失明显比把goodloan归类为badloan要大,因此这里加入一个cost矩阵,即为犯某类错误加上一个系数,比如如果我们按照cookbook里面的方法标记将badloan归类为goodloan的成本为5倍于goodloan归类为badloan。
#2)加入惩罚项
error_cost <- matrix(c(0, 1, 5, 0), nrow=2)
rownames(error_cost) <- c("GoodLoan", "BadLoan")
colnames(error_cost) <- c("GoodLoan", "BadLoan")
model_cost <- C5.0(train_credit[, -21], train_credit[, 21], costs=error_cost)
pred_cost <- predict(model_cost, test_credit)
table(pred_cost, test_credit[, 21])
##
## pred_cost BadLoan GoodLoan
## BadLoan 51 75
## GoodLoan 9 65
#从上可知:总错误率为84/200=42%,但是把badloan归类为goodloan的情况却大大减少了,只有9/60=15%。
#MarkDown脚本及数据集下载: http://pan.baidu.com/s/1dEx4vgX