library(tree)
# rpart() and its plot/text/predict methods come from the rpart package,
# which has to be attached explicitly for the calls below to resolve.
library(rpart)

# Soybean classification with a decision tree.
#
# Steps: read the comma-separated data file, replace the unknown marker "?"
# in every predictor column with that column's most frequent known value,
# split the rows 7:3 into a training and a test set, fit a classification
# tree and report the hit rate on the test set.
#
# NOTE(review): the last column (att36) is used as the class label; confirm
# that this matches the column layout of the local "soybean-large.data".

# Replace every `missing` marker in `x` by the mode of the remaining values.
#
# x        One data column (character, factor or numeric vector).
# missing  The string that marks an unknown value.
#
# Returns `x` with the markers replaced. `x` is returned unchanged when it
# contains no marker, or when it has no known value to take the mode from.
impute_mode <- function(x, missing = "?") {
  is_missing <- !is.na(x) & x == missing
  known <- x[!is_missing & !is.na(x)]
  if (!any(is_missing) || length(known) == 0L) {
    return(x)
  }
  # Count only known values so the marker itself can never win the vote.
  counts <- table(as.character(known))
  x[is_missing] <- names(counts)[which.max(counts)]
  x
}

# ---- Read the data ----
# stringsAsFactors = FALSE keeps text columns as character on every R
# version, so the imputation below works on plain strings.
soybean <- read.table(
  "soybean-large.data",
  header = FALSE, sep = ",", stringsAsFactors = FALSE
)
soybean_df <- soybean
stopifnot(ncol(soybean_df) == 36L)
header <- paste0("att", seq_len(36L))
names(soybean_df) <- header

# ---- Replace the unknown value "?" by the mode of each column ----
# Each column is masked by its own "?" positions. Text predictors are then
# turned into factors before the split, so that training and test rows share
# one set of levels and predict() never meets an unseen level.
predictors <- header[1:35]
soybean_df[predictors] <- lapply(
  soybean_df[predictors],
  function(column) {
    column <- impute_mode(column)
    if (is.character(column)) factor(column) else column
  }
)

# ---- Split the data 7:3 into a training and a test set ----
# Arbitrary fixed seed so the split, and therefore the reported hit rate,
# is reproducible between runs.
set.seed(1)
n_rows <- nrow(soybean_df)
train <- sample(seq_len(n_rows), floor(0.7 * n_rows))
stopifnot(length(train) > 0L)
test <- soybean_df[-train, ]

# ---- Fit the classification tree ----
# method = "class" is spelled out because predict(type = "class") below
# requires a classification tree.
soybean.tree <- rpart(att36 ~ ., data = soybean_df[train, ], method = "class")
plot(soybean.tree, margin = 0.1)
text(soybean.tree, cex = 0.5)

# ---- Predict on the test set ----
soybean.pred <- predict(soybean.tree, test, type = "class")
att36.test <- test[, 36]

# ---- Confusion matrix and hit rate ----
# Both factors get the full set of class labels, so the table is always
# square and its diagonal always pairs a class with itself, even when a
# class is absent from the predictions or from the test rows.
class_levels <- sort(unique(as.character(soybean_df[[36]])))
matrix_pred <- table(
  soybean.pred = factor(as.character(soybean.pred), levels = class_levels),
  att36.test = factor(as.character(att36.test), levels = class_levels)
)
print(matrix_pred)
print(sum(diag(matrix_pred)) / nrow(test))
# Console output recorded from one run of the original, unseeded script:
# [1] 0.6391753