This post collects the code from Chapters 12 through 15 of the book 《R语言实战——机器学习与数据分析》 (Publishing House of Electronics Industry, 2016), mainly covering the EM algorithm, support vector machines, and artificial neural networks, among other topics. The book's introduction is available at the following link:
http://blog.csdn.net/baimafujinji/article/details/51596171
About the book: This book systematically introduces the most important and widely used techniques in statistical analysis and machine learning, together with their underlying principles, and demonstrates how they are applied in practice through a large number of R examples built on detailed explanations of the algorithms. The material is organized into three parts: R programming fundamentals, statistics-based data analysis methods, and machine learning theory. The statistics and machine learning parts cover parameter estimation, hypothesis testing, maximum likelihood estimation, nonparametric tests (including contingency-table analysis, the sign test, and the signed-rank test), analysis of variance, linear regression (including ridge regression and the Lasso), logistic regression, support vector machines, cluster analysis (including the K-means and EM algorithms), and artificial neural networks. The treatment of statistical theory also helps deepen the reader's understanding of the machine learning material that follows. The ordering of topics is designed both to support step-by-step learning and to build a solid foundation.
Where to buy online:
Publishing House of Electronics Industry official store
China-pub
JD.com (1)
JD.com (2)
Chapter 12
P279~280
qf(0.05 , 2, 15, lower.tail = FALSE)
pf(4.698, 2, 15, lower.tail = FALSE)
X <- c(4.2, 3.3, 3.7, 4.3, 4.1, 3.3,
       4.5, 4.4, 3.5, 4.2, 4.6, 4.2,
       5.6, 3.6, 4.5, 5.1, 4.9, 4.7)
A <- factor(rep(1:3, each=6))
my.data <- data.frame(X, A)
my.aov <- aov(X~A, data = my.data)
summary(my.aov)
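As a quick sanity check (this snippet is my addition, not from the book's listing), the F statistic reported by summary(my.aov) can be recomputed by hand from the between- and within-group sums of squares, reusing the X and A defined above:
fit <- ave(X, A)                        # each observation's group mean
SSE <- sum((X - fit)^2)                 # within-group sum of squares
SSA <- sum((fit - mean(X))^2)           # between-group sum of squares
F.value <- (SSA / 2) / (SSE / 15)       # df = 2 between groups, 15 within
F.value                                 # approximately 4.698, as above
pf(F.value, 2, 15, lower.tail = FALSE)  # matches the p-value from summary(my.aov)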
P285
qf(0.05, 2, 10, lower.tail = FALSE)
pf(4.24, 2, 10, lower.tail = FALSE)
qf(0.05, 5, 10, lower.tail = FALSE)
pf(29.49, 5, 10, lower.tail = FALSE)
x <- c(64, 65, 73, 53, 54, 59, 71, 68, 79,
       41, 46, 38, 50, 58, 65, 42, 40, 46)
my.data <- data.frame(x, A = gl(6, 3), B = gl(3, 1, 18))
my.aov <- aov(x ~ A+B, data = my.data)
summary(my.aov)
P286~287
pancakes <- data.frame(
    supp = rep(c("no supplement", "supplement"), each = 12),
    whey = rep(rep(c("0%", "10%", "20%", "30%"), each = 3), 2),
    quality = c(4.4, 4.5, 4.3, 4.6, 4.5, 4.8,
                4.5, 4.8, 4.8, 4.6, 4.7, 5.1,
                3.3, 3.2, 3.1, 3.8, 3.7, 3.6,
                5.0, 5.3, 4.8, 5.4, 5.6, 5.3))
round(tapply(pancakes$quality, pancakes[, 1:2], mean), 2)
library(stats)
interaction.plot(pancakes$whey, pancakes$supp, pancakes$quality)
P289
pancakes.lm <- lm(quality ~ supp * whey, data = pancakes)
anova(pancakes.lm)
my.aov <- aov(quality ~ supp * whey, data = pancakes)
summary(my.aov)
P290~291
p.adjust.methods
pairwise.t.test(X, A, p.adjust.method = "bonferroni")
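To see what p.adjust.method = "bonferroni" actually does, here is a small illustration with hypothetical p-values (mine, not the book's): the Bonferroni correction multiplies each raw p-value by the number of comparisons, capping the result at 1.
p.raw <- c(0.010, 0.020, 0.030)         # hypothetical raw p-values
p.adjust(p.raw, method = "bonferroni")  # 0.03 0.06 0.09
p.adjust(p.raw, method = "holm")        # Holm's step-down method, less conservative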
P292~293
x <- c(4.2, 3.3, 3.7, 4.3, 4.1, 3.3,
       4.5, 4.4, 3.5, 4.2, 4.6, 4.2,
       5.6, 3.6, 4.5, 5.1, 4.9, 4.7)
group <- factor(rep(LETTERS[1:3], each = 6))
mice <- data.frame(x, group)
mice.aov <- aov(x ~ group, data = mice)
summary(mice.aov)
library(multcomp)
mice.Dunnett <- glht(mice.aov, linfct=mcp(group = "Dunnett"))
summary(mice.Dunnett)
P294
windows(width = 5, height = 3, pointsize = 10)
plot(mice.Dunnett, sub = "Mice Data")
mtext("Dunnett's Method", side = 3, line = 0.5)
qtukey(0.05, 3, 15, lower.tail = F)
P296~297
posthoc <- TukeyHSD(mice.aov, 'group')
posthoc
ptukey(1.96785, 3, 15, lower.tail = F)
ptukey(4.32925, 3, 15, lower.tail = F)
ptukey(2.36140, 3, 15, lower.tail = F)
library(agricolae)
comparison <- HSD.test(mice.aov, 'group', console = T)
P298~299
print(comparison$groups)
library(agricolae)
comparison <- SNK.test(mice.aov, "group", console = T)
P301
ptukey(1.968, 2, 15, lower.tail = F)
ptukey(4.329, 3, 15, lower.tail = F)
ptukey(2.361, 2, 15, lower.tail = F)
P303
pchisq(1.4947, 2, lower.tail = F)
bartlett.test(X ~ A, data = my.data)
P305~306
X <- c(0.383, 0.517, 0.117, 0.483, 0.283, 0.517,
       0.267, 0.167, 0.733, 0.033, 0.367, 0.033,
       0.867, 1.133, 0.233, 0.367, 0.167, 0.033)
A <- factor(rep(1:3, each=6))
my.data <- data.frame(X, A)
my.aov <- aov(X~A, data = my.data)
summary(my.aov)
library(car)
leveneTest(X ~ A, data = my.data)
leveneTest(X ~ A, data = my.data, center = mean)
library(lawstat)
levene.test(X, A, location="median")
levene.test(X, A, location="mean")
Chapter 13
P313
countries <- read.csv("c:/countries_data.csv")
head(countries)
var <- as.character(countries$countries)
rownames(countries) <- var   # use the country names as row labels
countries <- countries[, 2:3]
names(countries) <- c("Services(%)", "Aged_Population(%)")
head(countries)
my.km <- kmeans(countries, centers = 2)
my.km$center
head(my.km$cluster)
plot(countries, col = my.km$cluster)
points(my.km$centers, col = 1:2, pch = 8, cex = 2)
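The book fixes the number of clusters at 2. As an optional sketch (the k range and variable names here are my own, not the book's), the common elbow heuristic plots the total within-cluster sum of squares against k:
set.seed(1)  # kmeans uses random starting centers
wss <- sapply(1:6, function(k)
    kmeans(countries, centers = k, nstart = 20)$tot.withinss)
plot(1:6, wss, type = "b", xlab = "Number of clusters k",
     ylab = "Total within-cluster sum of squares")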
P323~324
library(mclust)
my.em <- Mclust(countries)
summary(my.em)
summary(my.em, parameters = TRUE)
mclust2Dplot(countries, parameters = my.em$parameters,
             z = my.em$z, what = "classification", main = TRUE)
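Mclust chooses the mixture model and number of components by BIC; a minimal extra line (not in the book's listing) displays that model-selection curve:
plot(my.em, what = "BIC")  # BIC for each covariance model and number of components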
P325
model_density <- densityMclust(countries)
plot(model_density, countries, col = "cadetblue",
     nlevels = 25, what = "density")
plot(model_density, what = "density", type = "persp", theta = 235)
Chapter 14
P351~352
library(lattice)
xyplot(Petal.Length ~ Petal.Width, data = iris, groups = Species,
       auto.key = list(corner = c(1, 0)))
data(iris)
attach(iris)
subdata <- iris[iris$Species != 'virginica',]
subdata$Species <- factor(subdata$Species)
library(e1071)
model1 <- svm(Species ~ Petal.Length + Petal.Width, data = subdata)
P353~354
plot(model1, subdata, Petal.Length ~ Petal.Width)
model2 <- svm(Species ~ ., data = iris)
summary(model2)
x <- iris[, -5]  # all columns of iris except the 5th: the feature variables
y <- iris[, 5]   # the 5th column of iris: the class labels
model3 <- svm(x, y, kernel = "radial",
              gamma = if (is.vector(x)) 1 else 1 / ncol(x))
P355~356
pred <- predict(model3, x)
table(pred, y)
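A small addition of my own: the training accuracy implied by the confusion table above can be computed directly.
mean(pred == y)                        # fraction of correctly classified observations
sum(diag(table(pred, y))) / length(y)  # the same value, read from the table's diagonal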
pred <- predict(model3, x, decision.values = TRUE)
attr(pred, "decision.values")[1:4,]
pred[77:78]
plot(cmdscale(dist(iris[, -5])),
     col = c("orange", "blue", "green")[as.integer(iris[, 5])],
     pch = c("o", "+")[1:150 %in% model3$index + 1])
legend(1.8, -0.8, c("setosa", "versicolor", "virginica"),
       col = c("orange", "blue", "green"), lty = 1)
Chapter 15
P372~373
library(nnet)
# note: sample() draws at random, so results vary between runs unless set.seed() is called
samp <- c(sample(1:50, 25), sample(51:100, 25), sample(101:150, 25))
ird <- data.frame(rbind(iris3[,,1], iris3[,,2], iris3[,,3]),
                  species = factor(c(rep("s", 50), rep("c", 50), rep("v", 50))))
ir.nn1 <- nnet(species ~ ., data = ird, subset = samp, size = 2,
               rang = 0.1, decay = 5e-4, maxit = 200)
targets <- class.ind(c(rep("s", 50), rep("c", 50), rep("v", 50)))
ir <- rbind(iris3[,,1], iris3[,,2], iris3[,,3])
ir.nn2 <- nnet(ir[samp,], targets[samp,], size = 2, rang = 0.1,
               decay = 5e-4, maxit = 200)
summary(ir.nn1)
table(ird$species[-samp], predict(ir.nn1, ird[-samp,], type = "class"))
P374
pre.matrix <- function(true, pred) {
    name <- c("c", "s", "v")
    true <- name[max.col(true)]  # convert one-hot target rows back to class labels
    cres <- name[max.col(pred)]  # predicted class = column with the largest output
    table(true, cres)            # confusion matrix
}
pre.matrix(targets[-samp,], predict(ir.nn2, ir[-samp,]))
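As a hedged follow-up (not from the book): the overall hold-out accuracy can be read off the diagonal of this confusion matrix, assuming every class occurs among the predictions so that the matrix is square.
cm <- pre.matrix(targets[-samp,], predict(ir.nn2, ir[-samp,]))
sum(diag(cm)) / sum(cm)  # proportion of held-out flowers classified correctly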