#----------------------------------------
# 功能描述:演示C50建模过程
# 数据集:汉堡大学信贷模型,信贷数据
#
#----------------------------------------
## Step 1: Collect the data ----
# NOTE(review): the right-hand sides of the assignments in this section were
# missing from the file; they are reconstructed from the comments and from how
# each object is used further down. Confirm them against your data.

# Import the CSV file.
# NOTE(review): the file name is an assumption -- point it at the real
# location of the credit data. stringsAsFactors = TRUE keeps the categorical
# columns (including `default`) as factors on R >= 4.0; the classifier below
# expects the class vector to be a factor.
credit <- read.csv("credit.csv", stringsAsFactors = TRUE)

## Step 2: Explore and prepare the data ----
# Inspect two of the categorical features.
table(credit$checking_balance)
table(credit$savings_balance)

# Five-number summaries of two numeric features.
summary(credit$months_loan_duration)
summary(credit$amount)

# Distribution of the class variable.
table(credit$default)

# Shuffle the rows before splitting into training and test data. The seed is
# fixed so that the analysis can be repeated with the same random order.
set.seed(12345)
n_rows <- nrow(credit)
credit_rand <- credit[order(runif(n_rows)), ]

# Compare the data sets: the summaries should agree, while the first few
# values should differ because only the row order has changed.
summary(credit$amount)
summary(credit_rand$amount)
head(credit$amount)
head(credit_rand$amount)

# Split the shuffled data.
# NOTE(review): the split sizes were lost; the first 90% of the shuffled rows
# are used for training and the rest for testing -- adjust if needed.
n_train <- (n_rows * 9L) %/% 10L
train_rows <- seq_len(n_train)
credit_train <- credit_rand[train_rows, ]
credit_test <- credit_rand[setdiff(seq_len(n_rows), train_rows), ]

# Check that the class proportions are similar in both subsets.
prop.table(table(credit_train$default))
prop.table(table(credit_test$default))
## Step 3: Train the model ----
library(C50)
#---------------------------------------------
# Building the classifier:
#   m <- C5.0(train, class, trials = 1, costs = NULL)
#   train:  a data frame containing the training data
#   class:  a factor vector holding the class of each row of the training data
#   trials: an optional number controlling the number of boosting iterations
#           (default 1)
#   costs:  an optional matrix giving the cost associated with each type of
#           error
# The function returns a C5.0 model object that can be used for prediction.
#
# Making predictions:
#   p <- predict(m, test, type = "class")
#   m:    a model trained by C5.0(train, class, trials = 1, costs = NULL)
#   test: a data frame containing the test data, with the same features as
#         the training data used to build the classifier
#   type: either "class" or "prob", selecting whether the prediction is the
#         most likely class value or the raw predicted probabilities
# The function returns a vector that, depending on type, holds the predicted
# class values or the raw predicted probabilities.
#
# Example:
#   credit_model <- C5.0(predictors, credit_train$default)
#   credit_prediction <- predict(credit_model, credit_test)
#----------------------------------------------

# Build the decision tree model.
# NOTE(review): this call was missing from the file and is reconstructed.
# The class column is removed from the predictors by name, so the call does
# not depend on the position of `default` in the data frame.
predictor_cols <- setdiff(names(credit_train), "default")
credit_model <- C5.0(credit_train[predictor_cols], credit_train$default)

# Display the decision tree model.
credit_model

# Display the detailed model information.
summary(credit_model)
## Step 4: Evaluate model performance ----
# Create a factor vector of predictions on the test data.
# NOTE(review): this call was missing from the file and is reconstructed.
credit_pred <- predict(credit_model, credit_test)

# Cross-tabulate the actual classes against the predicted classes.
library(gmodels)
CrossTable(
  credit_test$default, credit_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
## Step 5: Improve model performance ----
## Boosting the accuracy of decision trees
# NOTE(review): the left part of each model-building call and both predict()
# calls were missing from the file; only the `trials = ...)` tails remained.
# The calls are reconstructed from those tails and from the object names.

# Predictors are every column except the class column `default`.
boost_predictors <- credit_train[setdiff(names(credit_train), "default")]

# Boosted decision tree with 10 trials.
credit_boost10 <- C5.0(
  boost_predictors, credit_train$default,
  trials = 10
)
credit_boost10
summary(credit_boost10)

credit_boost_pred10 <- predict(credit_boost10, credit_test)
CrossTable(
  credit_test$default, credit_boost_pred10,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)

# Boosted decision tree with 100 trials (not shown in text).
credit_boost100 <- C5.0(
  boost_predictors, credit_train$default,
  trials = 100
)

credit_boost_pred100 <- predict(credit_boost100, credit_test)
CrossTable(
  credit_test$default, credit_boost_pred100,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
## Making some mistakes more costly than others
# NOTE(review): the cost matrix definition, the left part of the C5.0() call
# and the predict() call were missing from the file and are reconstructed.

# Create a cost matrix. Rows are labelled as predicted classes and columns as
# actual classes, both using the levels of `default`; the diagonal (correct
# predictions) costs nothing.
# NOTE(review): the cost values are an assumption. As written, predicting the
# first level when the second is true costs 4, and the reverse mistake costs
# 1 -- confirm the level order and the intended penalties.
class_levels <- levels(credit_train$default)
error_cost <- matrix(
  c(0, 1, 4, 0),
  nrow = 2,
  dimnames = list(predicted = class_levels, actual = class_levels)
)
error_cost

# Apply the cost matrix to the tree. Predictors are every column except the
# class column `default`.
cost_predictors <- credit_train[setdiff(names(credit_train), "default")]
credit_cost <- C5.0(
  cost_predictors, credit_train$default,
  costs = error_cost
)

credit_cost_pred <- predict(credit_cost, credit_test)
CrossTable(
  credit_test$default, credit_cost_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
#### Part 2: Rule Learners -------------------
## Example: Identifying Poisonous Mushrooms ----
# NOTE(review): the right-hand sides of the assignments in this part were
# missing from the file; they are reconstructed from the comments and from
# how each object is used. Confirm them against your data.

## Step 2: Exploring and preparing the data ----
# NOTE(review): the file name is an assumption -- point it at the real
# location of the mushroom data. stringsAsFactors = TRUE keeps the text
# columns as factors on R >= 4.0.
mushrooms <- read.csv("mushrooms.csv", stringsAsFactors = TRUE)

# Examine the structure of the data frame.
str(mushrooms)

# Drop the veil_type feature.
mushrooms$veil_type <- NULL

# Examine the class distribution.
table(mushrooms$type)

## Step 3: Training a model on the data ----
library(RWeka)

# Train OneR() on the data, predicting `type` from all remaining features.
mushroom_1R <- OneR(type ~ ., data = mushrooms)

## Step 4: Evaluating model performance ----
mushroom_1R
summary(mushroom_1R)

## Step 5: Improving model performance ----
# Train the JRip() rule learner on the same formula.
mushroom_JRip <- JRip(type ~ ., data = mushrooms)
mushroom_JRip
summary(mushroom_JRip)

# Rule Learner Using C5.0 Decision Trees (not in text)
library(C50)
# rules = TRUE asks C5.0() for a rule-based model instead of a tree.
# NOTE(review): the original predictor set was lost; all features are used.
mushroom_c5rules <- C5.0(type ~ ., data = mushrooms, rules = TRUE)
summary(mushroom_c5rules)