2.4决策树之决策树实例

实例一、rpart包对iris数据进行分类

#1、加载数据并查看
data("iris")
str(iris)
## 'data.frame':    150 obs. of  5 variables:
##  $ Sepal.Length: num  5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num  3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num  1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num  0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
summary(iris)
##   Sepal.Length    Sepal.Width     Petal.Length    Petal.Width   
##  Min.   :4.300   Min.   :2.000   Min.   :1.000   Min.   :0.100  
##  1st Qu.:5.100   1st Qu.:2.800   1st Qu.:1.600   1st Qu.:0.300  
##  Median :5.800   Median :3.000   Median :4.350   Median :1.300  
##  Mean   :5.843   Mean   :3.057   Mean   :3.758   Mean   :1.199  
##  3rd Qu.:6.400   3rd Qu.:3.300   3rd Qu.:5.100   3rd Qu.:1.800  
##  Max.   :7.900   Max.   :4.400   Max.   :6.900   Max.   :2.500  
##        Species  
##  setosa    :50  
##  versicolor:50  
##  virginica :50  
##                 
##                 
## 
#总结:从上可知,共计150条记录,5个变量,其中,4个自变量为数字类型,因变量为因子类型;无缺失值

#2、创建训练集和测试集数据
set.seed(123)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.2.3
index <- createDataPartition(iris$Species, p=0.8, list=F)
train_iris <- iris[index, ]
test_iris <- iris[-index, ]

#3、建模
library(rpart)
model_iris <- rpart(Species~., data=train_iris, control = rpart.control(minsplit = 10))

#4、模型评估
str(model_iris)
## List of 14
##  $ frame              :'data.frame': 5 obs. of  9 variables:
##   ..$ var       : Factor w/ 2 levels "<leaf>","Petal.Length": 2 1 2 1 1
##   ..$ n         : int [1:5] 120 40 80 39 41
##   ..$ wt        : num [1:5] 120 40 80 39 41
##   ..$ dev       : num [1:5] 80 0 40 1 2
##   ..$ yval      : num [1:5] 1 1 2 2 3
##   ..$ complexity: num [1:5] 0.5 0.01 0.463 0 0
##   ..$ ncompete  : int [1:5] 3 0 3 0 0
##   ..$ nsurrogate: int [1:5] 3 0 3 0 0
##   ..$ yval2     : num [1:5, 1:8] 1 1 2 2 3 40 40 0 0 0 ...
##   .. ..- attr(*, "dimnames")=List of 2
##   .. .. ..$ : NULL
##   .. .. ..$ : chr [1:8] "" "" "" "" ...
##  $ where              : Named int [1:120] 2 2 2 2 2 2 2 2 2 2 ...
##   ..- attr(*, "names")= chr [1:120] "1" "2" "3" "4" ...
##  $ call               : language rpart(formula = Species ~ ., data = train_iris, control = rpart.control(minsplit = 10))
##  $ terms              :Classes 'terms', 'formula' length 3 Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width
##   .. ..- attr(*, "variables")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
##   .. ..- attr(*, "factors")= int [1:5, 1:4] 0 1 0 0 0 0 0 1 0 0 ...
##   .. .. ..- attr(*, "dimnames")=List of 2
##   .. .. .. ..$ : chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
##   .. .. .. ..$ : chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..- attr(*, "term.labels")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##   .. ..- attr(*, "order")= int [1:4] 1 1 1 1
##   .. ..- attr(*, "intercept")= int 1
##   .. ..- attr(*, "response")= int 1
##   .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv> 
##   .. ..- attr(*, "predvars")= language list(Species, Sepal.Length, Sepal.Width, Petal.Length, Petal.Width)
##   .. ..- attr(*, "dataClasses")= Named chr [1:5] "factor" "numeric" "numeric" "numeric" ...
##   .. .. ..- attr(*, "names")= chr [1:5] "Species" "Sepal.Length" "Sepal.Width" "Petal.Length" ...
##  $ cptable            : num [1:3, 1:5] 0.5 0.463 0.01 0 1 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:3] "1" "2" "3"
##   .. ..$ : chr [1:5] "CP" "nsplit" "rel error" "xerror" ...
##  $ method             : chr "class"
##  $ parms              :List of 3
##   ..$ prior: num [1:3(1d)] 0.333 0.333 0.333
##   .. ..- attr(*, "dimnames")=List of 1
##   .. .. ..$ : chr [1:3] "1" "2" "3"
##   ..$ loss : num [1:3, 1:3] 0 1 1 1 0 1 1 1 0
##   ..$ split: num 1
##  $ control            :List of 9
##   ..$ minsplit      : num 10
##   ..$ minbucket     : num 3
##   ..$ cp            : num 0.01
##   ..$ maxcompete    : int 4
##   ..$ maxsurrogate  : int 5
##   ..$ usesurrogate  : int 2
##   ..$ surrogatestyle: int 0
##   ..$ maxdepth      : int 30
##   ..$ xval          : int 10
##  $ functions          :List of 3
##   ..$ summary:function (yval, dev, wt, ylevel, digits)  
##   ..$ print  :function (yval, ylevel, digits)  
##   ..$ text   :function (yval, dev, wt, ylevel, digits, n, use.n)  
##  $ numresp            : int 5
##  $ splits             : num [1:14, 1:5] 120 120 120 120 0 0 0 80 80 80 ...
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:14] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width" ...
##   .. ..$ : chr [1:5] "count" "ncat" "improve" "index" ...
##  $ variable.importance: Named num [1:4] 74.2 69.9 44.8 30.8
##   ..- attr(*, "names")= chr [1:4] "Petal.Length" "Petal.Width" "Sepal.Length" "Sepal.Width"
##  $ y                  : int [1:120] 1 1 1 1 1 1 1 1 1 1 ...
##  $ ordered            : Named logi [1:4] FALSE FALSE FALSE FALSE
##   ..- attr(*, "names")= chr [1:4] "Sepal.Length" "Sepal.Width" "Petal.Length" "Petal.Width"
##  - attr(*, "xlevels")= Named list()
##  - attr(*, "ylevels")= chr [1:3] "setosa" "versicolor" "virginica"
##  - attr(*, "class")= chr "rpart"
summary(model_iris)
## Call:
## rpart(formula = Species ~ ., data = train_iris, control = rpart.control(minsplit = 10))
##   n= 120 
## 
##       CP nsplit rel error xerror       xstd
## 1 0.5000      0    1.0000 1.1625 0.05717982
## 2 0.4625      1    0.5000 0.7500 0.06846532
## 3 0.0100      2    0.0375 0.0500 0.02457980
## 
## Variable importance
## Petal.Length  Petal.Width Sepal.Length  Sepal.Width 
##           34           32           20           14 
## 
## Node number 1: 120 observations,    complexity param=0.5
##   predicted class=setosa      expected loss=0.6666667  P(node) =1
##     class counts:    40    40    40
##    probabilities: 0.333 0.333 0.333 
##   left son=2 (40 obs) right son=3 (80 obs)
##   Primary splits:
##       Petal.Length < 2.45 to the left,  improve=40.00000, (0 missing)
##       Petal.Width  < 0.8  to the left,  improve=40.00000, (0 missing)
##       Sepal.Length < 5.45 to the left,  improve=25.97403, (0 missing)
##       Sepal.Width  < 3.35 to the right, improve=17.24432, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 0.8  to the left,  agree=1.000, adj=1.000, (0 split)
##       Sepal.Length < 5.45 to the left,  agree=0.908, adj=0.725, (0 split)
##       Sepal.Width  < 3.35 to the right, agree=0.850, adj=0.550, (0 split)
## 
## Node number 2: 40 observations
##   predicted class=setosa      expected loss=0  P(node) =0.3333333
##     class counts:    40     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 3: 80 observations,    complexity param=0.4625
##   predicted class=versicolor  expected loss=0.5  P(node) =0.6666667
##     class counts:     0    40    40
##    probabilities: 0.000 0.500 0.500 
##   left son=6 (39 obs) right son=7 (41 obs)
##   Primary splits:
##       Petal.Length < 4.75 to the left,  improve=34.246400, (0 missing)
##       Petal.Width  < 1.65 to the left,  improve=30.798240, (0 missing)
##       Sepal.Length < 6.15 to the left,  improve= 8.120301, (0 missing)
##       Sepal.Width  < 2.45 to the left,  improve= 3.657143, (0 missing)
##   Surrogate splits:
##       Petal.Width  < 1.75 to the left,  agree=0.938, adj=0.872, (0 split)
##       Sepal.Length < 6.15 to the left,  agree=0.738, adj=0.462, (0 split)
##       Sepal.Width  < 2.95 to the left,  agree=0.638, adj=0.256, (0 split)
## 
## Node number 6: 39 observations
##   predicted class=versicolor  expected loss=0.02564103  P(node) =0.325
##     class counts:     0    38     1
##    probabilities: 0.000 0.974 0.026 
## 
## Node number 7: 41 observations
##   predicted class=virginica   expected loss=0.04878049  P(node) =0.3416667
##     class counts:     0     2    39
##    probabilities: 0.000 0.049 0.951
#5、模型优化:选择具有最小误差的决策树
opt <- which.min(model_iris$cptable[, "xerror"])
cp <- model_iris$cptable[opt, "CP"]

model_cp <- prune(model_iris, cp=cp)
print(model_cp)
## n= 120 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 120 80 setosa (0.33333333 0.33333333 0.33333333)  
##   2) Petal.Length< 2.45 40  0 setosa (1.00000000 0.00000000 0.00000000) *
##   3) Petal.Length>=2.45 80 40 versicolor (0.00000000 0.50000000 0.50000000)  
##     6) Petal.Length< 4.75 39  1 versicolor (0.00000000 0.97435897 0.02564103) *
##     7) Petal.Length>=4.75 41  2 virginica (0.00000000 0.04878049 0.95121951) *
#6、预测
pred <- predict(model_cp, test_iris, type="class")
plot(model_cp)
text(model_cp)

#7、查看预测错误的比率
mean(test_iris[, 5]!=pred)
## [1] 0.1333333
table(pred, test_iris[, 5])
##             
## pred         setosa versicolor virginica
##   setosa         10          0         0
##   versicolor      0          6         0
##   virginica       0          4        10
library(gmodels)
CrossTable(pred, test_iris[, 5])
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  30 
## 
##  
##              | test_iris[, 5] 
##         pred |     setosa | versicolor |  virginica |  Row Total | 
## -------------|------------|------------|------------|------------|
##       setosa |         10 |          0 |          0 |         10 | 
##              |     13.333 |      3.333 |      3.333 |            | 
##              |      1.000 |      0.000 |      0.000 |      0.333 | 
##              |      1.000 |      0.000 |      0.000 |            | 
##              |      0.333 |      0.000 |      0.000 |            | 
## -------------|------------|------------|------------|------------|
##   versicolor |          0 |          6 |          0 |          6 | 
##              |      2.000 |      8.000 |      2.000 |            | 
##              |      0.000 |      1.000 |      0.000 |      0.200 | 
##              |      0.000 |      0.600 |      0.000 |            | 
##              |      0.000 |      0.200 |      0.000 |            | 
## -------------|------------|------------|------------|------------|
##    virginica |          0 |          4 |         10 |         14 | 
##              |      4.667 |      0.095 |      6.095 |            | 
##              |      0.000 |      0.286 |      0.714 |      0.467 | 
##              |      0.000 |      0.400 |      1.000 |            | 
##              |      0.000 |      0.133 |      0.333 |            | 
## -------------|------------|------------|------------|------------|
## Column Total |         10 |         10 |         10 |         30 | 
##              |      0.333 |      0.333 |      0.333 |            | 
## -------------|------------|------------|------------|------------|
## 
## 

实例二、C50包中的C5.0()函数对德国信用数据分类

#1、加载数据并查看
credit <- read.csv("F:\\R\\Rworkspace\\机器学习实践课\\德国信用数据/german.data", header=F, sep=" ", stringsAsFactors=F)
str(credit)
## 'data.frame':    1000 obs. of  21 variables:
##  $ V1 : chr  "A11" "A12" "A14" "A11" ...
##  $ V2 : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ V3 : chr  "A34" "A32" "A34" "A32" ...
##  $ V4 : chr  "A43" "A43" "A46" "A42" ...
##  $ V5 : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ V6 : chr  "A65" "A61" "A61" "A61" ...
##  $ V7 : chr  "A75" "A73" "A74" "A74" ...
##  $ V8 : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ V9 : chr  "A93" "A92" "A93" "A93" ...
##  $ V10: chr  "A101" "A101" "A101" "A103" ...
##  $ V11: int  4 2 3 4 4 4 4 2 4 2 ...
##  $ V12: chr  "A121" "A121" "A121" "A122" ...
##  $ V13: int  67 22 49 45 53 35 53 35 61 28 ...
##  $ V14: chr  "A143" "A143" "A143" "A143" ...
##  $ V15: chr  "A152" "A152" "A152" "A153" ...
##  $ V16: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ V17: chr  "A173" "A173" "A172" "A173" ...
##  $ V18: int  1 1 2 2 2 2 1 1 1 1 ...
##  $ V19: chr  "A192" "A191" "A191" "A191" ...
##  $ V20: chr  "A201" "A201" "A201" "A201" ...
##  $ V21: int  1 2 1 1 2 1 1 1 1 2 ...
summary(credit)
##       V1                  V2            V3                 V4           
##  Length:1000        Min.   : 4.0   Length:1000        Length:1000       
##  Class :character   1st Qu.:12.0   Class :character   Class :character  
##  Mode  :character   Median :18.0   Mode  :character   Mode  :character  
##                     Mean   :20.9                                        
##                     3rd Qu.:24.0                                        
##                     Max.   :72.0                                        
##        V5             V6                 V7                  V8       
##  Min.   :  250   Length:1000        Length:1000        Min.   :1.000  
##  1st Qu.: 1366   Class :character   Class :character   1st Qu.:2.000  
##  Median : 2320   Mode  :character   Mode  :character   Median :3.000  
##  Mean   : 3271                                         Mean   :2.973  
##  3rd Qu.: 3972                                         3rd Qu.:4.000  
##  Max.   :18424                                         Max.   :4.000  
##       V9                V10                 V11            V12           
##  Length:1000        Length:1000        Min.   :1.000   Length:1000       
##  Class :character   Class :character   1st Qu.:2.000   Class :character  
##  Mode  :character   Mode  :character   Median :3.000   Mode  :character  
##                                        Mean   :2.845                     
##                                        3rd Qu.:4.000                     
##                                        Max.   :4.000                     
##       V13            V14                V15                 V16       
##  Min.   :19.00   Length:1000        Length:1000        Min.   :1.000  
##  1st Qu.:27.00   Class :character   Class :character   1st Qu.:1.000  
##  Median :33.00   Mode  :character   Mode  :character   Median :1.000  
##  Mean   :35.55                                         Mean   :1.407  
##  3rd Qu.:42.00                                         3rd Qu.:2.000  
##  Max.   :75.00                                         Max.   :4.000  
##      V17                 V18            V19                V20           
##  Length:1000        Min.   :1.000   Length:1000        Length:1000       
##  Class :character   1st Qu.:1.000   Class :character   Class :character  
##  Mode  :character   Median :1.000   Mode  :character   Mode  :character  
##                     Mean   :1.155                                        
##                     3rd Qu.:1.000                                        
##                     Max.   :2.000                                        
##       V21     
##  Min.   :1.0  
##  1st Qu.:1.0  
##  Median :1.0  
##  Mean   :1.3  
##  3rd Qu.:2.0  
##  Max.   :2.0
#从上可知:共计1000条记录21个变量,有字符类型也有数字类型;每列的属性名为默认名称;

#2、数据预处理
#1)对数据的列进行命名
colnames(credit) <- c("Status.of.existing.checking.account", "Duration.in.month", 
"Credit.history", "Purpose", "Credit.amount", "Savings.account.bonds", 
"Present.employment.since", 
"Installment.rate.in.percentage.of.disposable.income",
"Personal.status.and.sex", "Other.debtors.guarantors", 
"Present.residence.since","Property", "Age.in.years", 
"Other.installment.plans", "Housing", 
"Number.of.existing.credits.at.this.bank", "Job", 
"Number.of.people.being.liable.to.provide.maintenance.for", 
"Telephone", "foreign.worker", "Good.Loan")

#2)对每列的数据进行映射:
#自定义映射
mapping <- list('All'='... < 0 DM', 
'A12'='0 <= ... < 200 DM', 
'A13'='... >=200 DM / salary assignment for at least 1 year',
'A14'='no checking accout', 
'A30'='no credits taken/all credits paid back duly',
'A31'='all credits at this bank paid back duly',
'A32'='existing credits paid back duly till now', 
'A33'='delay in paying off in the past', 
'A34'='critical account/other credits existing (not at this bank)',
'A40'='car (new)',
'A41'='car (used)',
'A42'='furniture/equipment',
'A43'='radio/television', 
'A44'='domestic appliances',
'A45'='repairs',
'A46'='education',
'A47'='(vacation - does not exist?)',
'A48'='retraining', 
'A49'='business', 
'A410'='other', 
'A61'='... < 100 DM', 
'A62'='100 <= ... < 500 DM',
'A63'='500 <= ... < 1000 DM', 
'A64'='.. >= 1000 DM', 
'A65'='unknown/ no savings account',
'A71'='unemployed', 
'A72'='... < 1 year', 
'A73'='1 <= ... < 4 years', 
'A74'='4 <= ... < 7 years', 
'A75'='.. >= 7 years', 
'A91'='male : divorced/separated', 
'A92'='female : divorced/separated/married', 
'A93'='male : single', 
'A94'='male : married/widowed', 
'A95'='female : single', 
'A101'='none', 
'A102'='co-applicant',
'A103'='auarantor', 
'A121'='real estate',
'A122'='if not A121 : building society savings agreement/life insurance',
'A123'='if not A121/A122 : car or other, not in attribute 6',
'A124'='unknown / no property', 
'A141'='bank', 
'A142'='stores', 
'A143'='none', 
'A151'='rent',
'A152'='own', 
'A153'='for free',
'A171'='unemployed/ unskilled - non-resident', 
'A172'='unskilled - resident', 
'A173'='skilled employee / official', 
'A174'='management/ self=employed/hightly qualified employee/ officer', 
'A191'='none', 
'A192'='yes, registered under the customers name', 
'A201'='yes', 
'A202'='no')

#对每列的数据进行映射
for(i in 1:ncol(credit)) {
  if(class(credit[, i])=="character"){
    credit[, i] = as.factor(as.character(mapping[credit[, i]]))
  }
}

#3)对因变量进行因子转换
credit$Good.Loan <- as.factor(ifelse(credit$Good.Loan==1, "GoodLoan", "BadLoan"))

#查看清洗后的数据:查看各列的频数
table(credit$Status.of.existing.checking.account)
## 
## ... >=200 DM / salary assignment for at least 1 year 
##                                                   63 
##                                    0 <= ... < 200 DM 
##                                                  269 
##                                   no checking accout 
##                                                  394 
##                                                 NULL 
##                                                  274
table(credit$Savings.account.bonds)
## 
##               .. >= 1000 DM                ... < 100 DM 
##                          48                         603 
##         100 <= ... < 500 DM        500 <= ... < 1000 DM 
##                         103                          63 
## unknown/ no savings account 
##                         183
summary(credit$Duration.in.month)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    12.0    18.0    20.9    24.0    72.0
#3、构建测试集和训练集数据
library(caret)
set.seed(2000)
index <- createDataPartition(credit$Good.Loan, p=0.8, list=F)
train_credit <- credit[index, ]
test_credit <- credit[-index, ]
prop.table(table(train_credit$Good.Loan))
## 
##  BadLoan GoodLoan 
##      0.3      0.7
prop.table(table(test_credit$Good.Loan))
## 
##  BadLoan GoodLoan 
##      0.3      0.7
#从上可知:测试集和训练集数据的比率相同

#4、建模
library(C50)
model_credit <- C5.0(train_credit[, -21], train_credit[, 21])

#5、模型评估
model_credit
## 
## Call:
## C5.0.default(x = train_credit[, -21], y = train_credit[, 21])
## 
## Classification Tree
## Number of samples: 800 
## Number of predictors: 20 
## 
## Tree size: 60 
## 
## Non-standard options: attempt to group attributes
summary(model_credit)
## 
## Call:
## C5.0.default(x = train_credit[, -21], y = train_credit[, 21])
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Mon Feb 08 12:07:02 2016
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 800 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## Status.of.existing.checking.account in {... >=200 DM / salary assignment for at least 1 year,
## :                                       no checking accout}: GoodLoan (368/49)
## Status.of.existing.checking.account in {0 <= ... < 200 DM,NULL}:
## :...Duration.in.month <= 22: [S1]
##     Duration.in.month > 22:
##     :...Present.residence.since <= 1:
##         :...Housing = for free: GoodLoan (3)
##         :   Housing = rent: BadLoan (1)
##         :   Housing = own:
##         :   :...Status.of.existing.checking.account = NULL:
##         :       :...Credit.amount <= 1943: GoodLoan (2)
##         :       :   Credit.amount > 1943: BadLoan (6/1)
##         :       Status.of.existing.checking.account = 0 <= ... < 200 DM:
##         :       :...Telephone = none: GoodLoan (9)
##         :           Telephone = yes, registered under the customers name:
##         :           :...Credit.amount <= 3878: BadLoan (2)
##         :               Credit.amount > 3878: GoodLoan (2)
##         Present.residence.since > 1:
##         :...Savings.account.bonds = .. >= 1000 DM:
##             :...Other.debtors.guarantors = co-applicant: BadLoan (1)
##             :   Other.debtors.guarantors in {auarantor,none}: GoodLoan (4)
##             Savings.account.bonds = 500 <= ... < 1000 DM:
##             :...Other.installment.plans in {bank,none}: BadLoan (4)
##             :   Other.installment.plans = stores: GoodLoan (2)
##             Savings.account.bonds = unknown/ no savings account: [S2]
##             Savings.account.bonds = 100 <= ... < 500 DM:
##             :...Other.debtors.guarantors in {auarantor,
##             :   :                            co-applicant}: BadLoan (2)
##             :   Other.debtors.guarantors = none:
##             :   :...Duration.in.month > 54: BadLoan (3)
##             :       Duration.in.month <= 54: [S3]
##             Savings.account.bonds = ... < 100 DM:
##             :...Other.debtors.guarantors = auarantor: [S4]
##                 Other.debtors.guarantors in {co-applicant,none}:
##                 :...Duration.in.month > 45: BadLoan (23/1)
##                     Duration.in.month <= 45:
##                     :...Purpose in {car (new),domestic appliances,education,
##                         :           other,repairs,
##                         :           retraining}: BadLoan (20/3)
##                         Purpose = business: [S5]
##                         Purpose = car (used):
##                         :...Age.in.years <= 28: BadLoan (4)
##                         :   Age.in.years > 28: GoodLoan (11/1)
##                         Purpose = furniture/equipment: [S6]
##                         Purpose = radio/television: [S7]
## 
## SubTree [S1]
## 
## Credit.history = delay in paying off in the past: GoodLoan (13/1)
## Credit.history in {all credits at this bank paid back duly,
## :                  no credits taken/all credits paid back duly}: BadLoan (21/6)
## Credit.history = critical account/other credits existing (not at this bank):
## :...Other.installment.plans = stores: GoodLoan (1)
## :   Other.installment.plans = bank: [S8]
## :   Other.installment.plans = none:
## :   :...Other.debtors.guarantors = auarantor: GoodLoan (3)
## :       Other.debtors.guarantors = co-applicant: BadLoan (3/1)
## :       Other.debtors.guarantors = none:
## :       :...Purpose in {business,car (new),car (used),domestic appliances,
## :           :           furniture/equipment,other,radio/television,
## :           :           retraining}: GoodLoan (44/3)
## :           Purpose in {education,repairs}: BadLoan (3)
## Credit.history = existing credits paid back duly till now:
## :...Savings.account.bonds = .. >= 1000 DM: GoodLoan (8)
##     Savings.account.bonds in {... < 100 DM,100 <= ... < 500 DM,
##     :                         500 <= ... < 1000 DM,unknown/ no savings account}: [S9]
## 
## SubTree [S2]
## 
## Status.of.existing.checking.account = 0 <= ... < 200 DM: GoodLoan (13/1)
## Status.of.existing.checking.account = NULL: BadLoan (13/3)
## 
## SubTree [S3]
## 
## Telephone = yes, registered under the customers name: GoodLoan (6)
## Telephone = none: [S10]
## 
## SubTree [S4]
## 
## Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (2)
## Status.of.existing.checking.account = NULL: GoodLoan (6)
## 
## SubTree [S5]
## 
## Property in {if not A121 . building society savings agreement/life insurance,
## :            real estate,unknown / no property}: BadLoan (6)
## Property = if not A121/A122 . car or other, not in attribute 6: GoodLoan (2)
## 
## SubTree [S6]
## 
## Present.employment.since in {.. >= 7 years,... < 1 year,
## :                            4 <= ... < 7 years}: BadLoan (14)
## Present.employment.since in {1 <= ... < 4 years,unemployed}:
## :...Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (1)
##     Status.of.existing.checking.account = NULL: GoodLoan (3)
## 
## SubTree [S7]
## 
## Status.of.existing.checking.account = 0 <= ... < 200 DM: GoodLoan (4)
## Status.of.existing.checking.account = NULL:
## :...Present.employment.since = .. >= 7 years: GoodLoan (2)
##     Present.employment.since in {... < 1 year,4 <= ... < 7 years,
##     :                            unemployed}: BadLoan (6)
##     Present.employment.since = 1 <= ... < 4 years:
##     :...Credit.amount <= 2835: BadLoan (2)
##         Credit.amount > 2835: GoodLoan (2)
## 
## SubTree [S8]
## 
## Job in {management/ self=employed/hightly qualified employee/ officer,
## :       unemployed/ unskilled - non-resident}: BadLoan (0)
## Job = unskilled - resident: GoodLoan (3)
## Job = skilled employee / official:
## :...Purpose = business: GoodLoan (1)
##     Purpose in {car (new),car (used),domestic appliances,education,
##                 furniture/equipment,other,radio/television,repairs,
##                 retraining}: BadLoan (5)
## 
## SubTree [S9]
## 
## Property = real estate: GoodLoan (57/14)
## Property = unknown / no property:
## :...Age.in.years <= 56: BadLoan (5)
## :   Age.in.years > 56: GoodLoan (3/1)
## Property = if not A121 . building society savings agreement/life insurance:
## :...Number.of.existing.credits.at.this.bank > 1: GoodLoan (2)
## :   Number.of.existing.credits.at.this.bank <= 1: [S11]
## Property = if not A121/A122 . car or other, not in attribute 6:
## :...Credit.amount <= 1386:
##     :...Present.employment.since = .. >= 7 years: GoodLoan (2)
##     :   Present.employment.since in {... < 1 year,1 <= ... < 4 years,
##     :                                4 <= ... < 7 years,
##     :                                unemployed}: BadLoan (13)
##     Credit.amount > 1386:
##     :...Other.debtors.guarantors = auarantor: GoodLoan (0)
##         Other.debtors.guarantors = co-applicant: BadLoan (1)
##         Other.debtors.guarantors = none:
##         :...Age.in.years > 22: GoodLoan (21/3)
##             Age.in.years <= 22: [S12]
## 
## SubTree [S10]
## 
## Job in {management/ self=employed/hightly qualified employee/ officer,
## :       unskilled - resident}: BadLoan (3)
## Job in {skilled employee / official,
##         unemployed/ unskilled - non-resident}: GoodLoan (8/2)
## 
## SubTree [S11]
## 
## Status.of.existing.checking.account = 0 <= ... < 200 DM: BadLoan (9/2)
## Status.of.existing.checking.account = NULL: GoodLoan (18/7)
## 
## SubTree [S12]
## 
## Installment.rate.in.percentage.of.disposable.income <= 1: GoodLoan (2)
## Installment.rate.in.percentage.of.disposable.income > 1: BadLoan (2)
## 
## 
## Evaluation on training data (800 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      58   99(12.4%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     158    82    (a): class BadLoan
##      17   543    (b): class GoodLoan
## 
## 
##  Attribute usage:
## 
##  100.00% Status.of.existing.checking.account
##   54.00% Duration.in.month
##   38.75% Savings.account.bonds
##   30.00% Credit.history
##   26.75% Other.debtors.guarantors
##   24.00% Present.residence.since
##   17.88% Property
##   16.25% Purpose
##    8.63% Other.installment.plans
##    7.13% Credit.amount
##    6.00% Age.in.years
##    5.63% Present.employment.since
##    3.75% Telephone
##    3.63% Number.of.existing.credits.at.this.bank
##    3.13% Housing
##    2.50% Job
##    0.50% Installment.rate.in.percentage.of.disposable.income
## 
## 
## Time: 0.0 secs
pred_credit <- predict(model_credit, train_credit)
mean(pred_credit==train_credit[, 21])
## [1] 0.87625
#6、预测
pred <- predict(model_credit, test_credit)

#7、查看预测数据
mean(pred==test_credit[, 21])
## [1] 0.745
table(pred, test_credit[, 21])
##           
## pred       BadLoan GoodLoan
##   BadLoan       26       17
##   GoodLoan      34      123
library(gmodels)
CrossTable(pred, test_credit[, 21])
## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## | Chi-square contribution |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  200 
## 
##  
##              | test_credit[, 21] 
##         pred |   BadLoan |  GoodLoan | Row Total | 
## -------------|-----------|-----------|-----------|
##      BadLoan |        26 |        17 |        43 | 
##              |    13.303 |     5.701 |           | 
##              |     0.605 |     0.395 |     0.215 | 
##              |     0.433 |     0.121 |           | 
##              |     0.130 |     0.085 |           | 
## -------------|-----------|-----------|-----------|
##     GoodLoan |        34 |       123 |       157 | 
##              |     3.644 |     1.562 |           | 
##              |     0.217 |     0.783 |     0.785 | 
##              |     0.567 |     0.879 |           | 
##              |     0.170 |     0.615 |           | 
## -------------|-----------|-----------|-----------|
## Column Total |        60 |       140 |       200 | 
##              |     0.300 |     0.700 |           | 
## -------------|-----------|-----------|-----------|
## 
## 
#从上面列联表可知:错误率为(34+17)/200=25.05%,同时有34/60=57%d bad loan我们归类为good loan。这结果显然不满足我们的要求。

#8、继续模型优化:C5.0中加入了adaptive boosting的支持
#1)使用迭代
model_boot <- C5.0(train_credit[, -21],  train_credit[, 21], trials = 10)
pred_boot <- predict(model_boot, test_credit[, -21])
table(pred_boot, test_credit[, 21])
##           
## pred_boot  BadLoan GoodLoan
##   BadLoan       28       19
##   GoodLoan      32      121
#从上可知:加入了boost后错误率没变,badloan误判为goodloan的概率为32/60=51%。然而我们知道如果把badloan 归类为goodloan的损失明显比把goodloan归类为badloan要大,因此这里加入一个cost矩阵,即为犯某类错误加上一个系数,比如如果我们按照cookbook里面的方法标记将badloan归类为goodloan的成本为5倍于goodloan归类为badloan。

#2)加入惩罚项
error_cost <- matrix(c(0, 1, 5, 0), nrow=2)
rownames(error_cost) <- c("GoodLoan", "BadLoan")
colnames(error_cost) <- c("GoodLoan", "BadLoan")
model_cost <- C5.0(train_credit[, -21], train_credit[, 21], costs=error_cost)
pred_cost <- predict(model_cost, test_credit)
table(pred_cost, test_credit[, 21])
##           
## pred_cost  BadLoan GoodLoan
##   BadLoan       51       75
##   GoodLoan       9       65

#从上可知:总错误率为84/200=42%,但是把badloan归类为goodloan的情况却大大减少了,只有9/60=15%。

#MarkDown脚本及数据集下载: http://pan.baidu.com/s/1dEx4vgX

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 1
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值