R语言实战：机器学习与数据分析源代码6（最终弹）-CSDN博客

本文链接：https://blog.csdn.net/baimafujinji/article/details/51751954

本文辑录了《R语言实战——机器学习与数据分析》（电子工业出版社2016年出版）一书第7章后半部分（137页~145页）至第8章之代码。本书引言请见如下链接：
http://blog.csdn.net/baimafujinji/article/details/51596171

内容简介：本书系统地介绍了统计分析和机器学习领域中最为重要和流行的多种技术及它们的基本原理，在详解有关算法的基础上，结合大量R语言实例演示了这些理论在实践中的使用方法。具体内容被分成三个部分，即R语言编程基础、基于统计的数据分析方法以及机器学习理论。统计分析与机器学习部分又具体介绍了包括参数估计、假设检验、极大似然估计、非参数检验方法（包括列联分析、符号检验、符号秩检验等）、方差分析、线性回归（包括岭回归和Lasso方法）、逻辑回归、支持向量机、聚类分析（包括K均值算法和EM算法）和人工神经网络等内容。同时，统计理论的介绍也为深化读者对于后续机器学习部分的理解提供了很大助益。知识结构和阅读进度的安排上既兼顾了循序渐进的学习规律，亦统筹考虑了夯实基础的必要性

网上书店地址：

电子工业出版社官网
 中国互动出版网China-pub
京东商城（1）
京东商城（2）

Chapter 7 (From P137)

P137

assessment <- c("weak","good","limited","fair")
assessment1 <- factor(assessment)
assessment1

str(assessment1)

assessment1 <- factor(assessment, order=TRUE,
+ levels=c("good","fair","limited","weak"))

assessment1
str(assessment1)

sample <- c(12,15,7,10)
fsample <- factor(sample,levels=c(7,10,12,15,100))
fsample

length(fsample)

P138~139


fsample[5]<-100
fsample

fsample[6]<-99

wt <- c(46,39,35,42,43,43)
group <- c("A","B","C","A","B","C")
tapply(wt,as.factor(group),mean)

wt <- c(46,39,35,42,43,43,42,44,36,40,39,38)
diet <- c("A","B","C","A","B","C","A","B","C","A","B","C")
gender <- c("M","M","M","M","M","M","F","F","F","F","F","F")
tapply(wt,list(as.factor(diet),as.factor(gender)),mean)

split(wt,list(diet,gender))

P140~141

myopia

by(myopia,myopia$degree,function(frame) frame[,2]+frame[,3])

diet
gender
wt
table(list(diet,gender))

artery <- read.csv("C:/data/graft_arteries.csv")
artery

P142

table(list(artery$Diabetes,artery$Hypertension))

table(artery$Diabetes)

table(D=artery$Diabetes,H=artery$Hypertension,S=artery$Ever_smoked)

P143~144

dh_tab <- table(list(D=artery$Diabetes,H=artery$Hypertension))
dh_tab

dh_tab[1,1]
dh_tab[1,]
dh_tab[,2]

dh_tab[2,2]*4
dh_tab[1,]*2
dh_tab/3

dh_tab

apply(dh_tab,1,sum)
apply(dh_tab,2,sum)

addmargins(dh_tab)

dhs_tab <- table(D=artery$Diabetes,
+ H=artery$Hypertension,S=artery$Ever_smoked)
dhs_tab

addmargins(dhs_tab)

P145


apply(dhs_tab,"S",sum)
apply(dhs_tab,"D",sum)
apply(dhs_tab,"H",sum)

Chapter 8

P147~148

57/200745

pnorm(1)-pnorm(-1)
pnorm(2)-pnorm(-2)
pnorm(3)-pnorm(-3)

P150~151

n <- 200745
(p.hat <- 57/n)
p.hat + c(-1.96, 1.96) * sqrt(p.hat * (1 - p.hat)/n)

binom.test(57,200745)

conf.int<-function(x,n,sigma,alpha){
options(digits=5)
mean<-mean(x)
c(mean-sigma*qnorm(1-alpha/2,mean=0, sd=1,
lower.tail = TRUE)/sqrt(n),
mean+sigma*qnorm(1-alpha/2,mean=0, sd=1,
lower.tail = TRUE)/sqrt(n))
}

x<-c(112.5, 101.0, 103.0, 102.0, 100.5,
+ 102.6, 107.5, 95.00, 108.8, 115.6,
+ 100.0, 123.5, 102.0, 101.6, 102.2,
+ 116.6, 95.40, 97.80, 108.6, 105.0,
+ 136.8, 102.8, 101.5, 98.40, 93.30)

n <- 25
alpha <- 0.05
sigma <- 10
result <- conf.int(x, n, sigma, alpha)
result

P152~153

curve(dnorm(x), from = -5, to = 5, ylim = c(0, 0.45),
+ ylab ="", col = "blue")
par(new=TRUE)
curve(dt(x, 1), from = -5, to = 5, ylim = c(0, 0.45),
+ ylab ="", lty = 2, col = "red")
par(new=TRUE)
curve(dt(x, 3), from = -5, to = 5, ylim = c(0, 0.45),
+ ylab ="", lty = 3)
text.legend = c("dnorm","dt(1)", "dt(3)")
legend("topright", legend = text.legend, lty=c(1,2,3),
+ col = c("blue", "red", "black"))

pH <- c(6, 5.7, 6.2, 6.3, 6.5, 6.4, 6.9, 6.6,
+ 6.8, 6.7, 6.8, 7.1, 6.8, 7.1, 7.1, 7.5, 7)
mean(pH); sd(pH)
mean(pH)+ qt(c(0.025,0.975),length(pH)-1)*sd(pH)/sqrt(length(pH))

t.test(pH, mu=7)

P155

chisq.var.test <- function (x, alpha){
options(digits=4)
result<-list( )
n<-length(x)
v<-var(x)
result$conf.int.var <- c(
(n-1)*v/qchisq(alpha/2, df=n-1, lower.tail=F),
(n-1)*v/qchisq(alpha/2, df=n-1, lower.tail=T))
result$conf.int.se <- sqrt(result$conf.int.var)
result
}

chisq.var.test(x, 0.05)

P157

chicks <- data.frame(feed = rep(c(1,2), times=c(3,6)),
+ weight_gain = c(
+ 42, 68, 85,
+ 42, 97, 81, 95, 61, 103))
tapply(chicks$weight_gain, chicks$feed, mean)
tapply(chicks$weight_gain, chicks$feed, sd)

t.test(weight_gain ~ feed, data = chicks, var.equal = TRUE)

P159~160

t.test(weight_gain ~ feed, data = chicks)

Feed.1 <- c(44, 55, 68, 85, 90, 97)
Feed.2 <- c(42, 61, 81, 95, 97, 103)
t.test(Feed.2, Feed.1, paired = T)

diff = Feed.2-Feed.1
t.test(diff)

P161~164

Feed <- c(Feed.1, Feed.2)
group <- c(rep(1, 6), rep(2, 6))
t.test(Feed ~ group)

prop.test(x=c(225,128),n=c(500,400), correct=F)
prop.test(x=c(225,128),n=c(500,400))

1 - pbinom(5, size = 8, prob = 0.5)

pbinom(8, 100, 0.26)

P165~166

prop.test(8,100,p=0.26,alternative="less")

binom.test(8,100,p=0.26,alternative="less")

P168~169

2*pt(-2.9326, 16, lower.tail = T)

qt(0.025, 16); qt(0.975, 16)

pt(-2.9326, 16)

t.test(pH, mu = 7, alternative = "less")

P170~172

qt(0.025, 7); qt(0.975, 7)

pt(-0.9019, 7, lower.tail = T)*2

qt(0.025, 4.503); qt(0.975, 4.503)

pt(-0.9357, 4.503, lower.tail = T)*2

qt(0.025, 5); qt(0.975, 5)

2*(pt(3.2359, 5, lower.tail = F))

P177~178

f <- function(lamda){
logL = n*log(lamda) - lamda*sum(x)
return (logL)
}

x = c(518,612,713,388,434)
n = length(x)
duration <- optimize(f, c(0,1), maximum = TRUE)
duration

1/duration$maximum

library(MASS)
attach(geyser)

hist(waiting, freq = FALSE, col = "wheat")
lines(density(waiting), col = 'red', lwd = 2)

P179

LL<-function(params,data){
t1<-suppressWarnings(dnorm(data,params[2],params[3]))
t2<-suppressWarnings(dnorm(data,params[4],params[5]))
ll<-sum(log(params[1]*t1+(1-params[1])*t2))
return(ll)
}

library("maxLik")
mle <- maxLik(logLik = LL, start = c(0.5,50,10,80,10), data=waiting)
mle

a <- mle$estimate[1]
mu1<-mle$estimate[2]; s1<- mle$estimate[3]
mu2<-mle$estimate[4]; s2<- mle$estimate[5]
X<-seq(40,120,length=100)
f<-a*dnorm(X,mu1,s1)+(1-a)*dnorm(X,mu2,s2)
hist(waiting, freq = FALSE, col = “wheat”)
lines(density(waiting), col = ‘red’, lty = 2)
lines(X, f, col = “blue”)
text.legend = c(“Density Line”,”Max Likelihood”)
legend(“topright”, legend = text.legend, lty=c(2,1),
+ col = c(“red”,”blue”))