(a) Continuous Normal regression, ANOVA or ANCOVA
(b) Proportion Logistic regression
(c) Count Log-linear models
(d) Binary Binary logistic analysis
(e) Time at death Survival analysis
The explanatory variables
(a) All explanatory variables continuous Regression
(b) All explanatory variables categorical Analysis of variance (ANOVA)
(c) Explanatory variables both continuous and categorical Analysis of covariance (ANCOVA)
library(MASS)
windows(7,7)
boxcox(volume~log(girth)+log(height))
boxcox(volume~log(girth)+log(height),lambda=seq(-0.5,0.5,0.01))
boxcox(volume~girth+height)
boxcox(volume~girth+height,lambda=seq(0.1,0.6,0.01))
9.13 Model checking
For instance, we should routinely plot the residuals against:
the fitted values (to look for heteroscedasticity);
the explanatory variables (to look for evidence of curvature);
the sequence of data collection (to look for temporal correlation);
standard normal deviates (to look for non-normality of errors)
The idea is to see what patterns are generated in normal plots by the different kinds
of non-normality. In real applications we would use the generic plot(model) rather than mcheck (see
below). First, we write the function mcheck. The idea is to produce two plots, side by side: a plot of the
residuals against the fitted values on the left, and a plot of the ordered residuals against the quantiles of the
normal distribution on the right
mcheck <- function (obj, ...){
rs <- obj$resid
fv <- obj$fitted
windows(7,4)
par(mfrow=c(1,2))
plot(fv,rs,xlab="Fitted values",ylab="Residuals",pch=16,col="red")
abline(h=0, lty=2)
qqnorm(rs,xlab="Normal scores",ylab="Ordered residuals",main="",pch=16)
qqline(rs,lty=2,col="green")
par(mfrow=c(1,1))
invisible(NULL) }
x <- 0:30
e <- rnorm(31,mean=0,sd=5)
yn <- 10+x+e
mn <- lm(yn~x)
mcheck(mn)
eu <- 20*(runif(31)-0.5)
yu <- 10+x+eu
mu <- lm(yu~x)
mcheck(mu)
enb <- rnbinom(31,2,.3)
ynb <- 10+x+enb
mnb <- lm(ynb~x)
mcheck(mnb)
enb <- rnbinom(31,2,.3)
ynb <- 10+x+enb
mnb <- lm(ynb~x)
mcheck(mnb)
eg <- rgamma(31,1,1/x)
yg <- 10+x+eg
mg <- lm(yg~x)
mcheck(mg)
x <- c(2,3,3,3,4)
y <- c(2,3,2,1,2)
windows(7,4)
par(mfrow=c(1,2))
plot(x,y,xlim=c(0,8),ylim=c(0,8))
x1 <- c(x,7)
y1 <- c(y,6)
plot(x1,y1,xlim=c(0,8),ylim=c(0,8))
abline(lm(y1~x1),col="blue")
reg <- lm(y1~x1)
summary(reg)
influence.measures(reg)
influence.measures(reg)$is.inf
lm.influence(reg)
summary.aov(lm(y1[-6]~x1[-6]))
data <- read.table("c:\\temp\\ipomopsis.txt",header=T)
attach(data)
names(data)
model <- lm(Fruit[Grazing=="Grazed"]~Root[Grazing=="Grazed"])
model <- lm(Fruit~Root,subset=(Grazing=="Grazed"))
weights = rep(1, n.observations)
model <- lm(Fruit~Grazing,weights=Root)
summary(model)
model <- lm(Fruit~Grazing)
summary(model)
Root[37] <- NA
model <- lm(Fruit~Grazing*Root)
model <- lm(Fruit~Grazing*Root,na.action=na.fail)
leverage <- function(x){1/length(x)+(x-mean(x))ˆ2/sum((x-mean(x))ˆ2)}
plot(leverage(x),type="h",ylim=c(0,1),col="blue")
abline(h=4/6,lty=2,col="green")
Decay <- read.table("c:\\temp\\Decay.txt",header=T)
attach(Decay)
names(Decay)
model <- lm(amount~time)
par(mfrow=c(2,2))
plot(model)
data <- read.table("c:\\temp\\regression.txt",header=T)
attach(data)
names(data)
model <- lm(growth~tannin)
summary(model)
coef(model)
fitted(model)
resid(model)
vcov(model)
x <- 0:100
y <- 17+0.2*x+3*rnorm(101)
model0 <- lm(y~1)
model1 <- lm(y~x)
model2 <- lm(y~x+I(xˆ2))
models <- list(model0,model1,model2)
lapply(models,coef)
as.vector(unlist(lapply(models,coef)))[c(1,2,4)]
lapply(models,AIC)
Aliasing
data <- read.table("c:\\temp\\poly.txt",header=T)
attach(data)
names(data)
tapply(response,treatment,mean)
summary.aov(model)
is.factor(treatment)
is.ordered(treatment)
treatment <- ordered(treatment,levels=c("none","low","medium","high"))
levels(treatment)
model2 <- lm(response~treatment)
summary.aov(model2)
summary.lm(model2)
tapply(response,treatment,mean)
yv <- as.vector(tapply(response,treatment,mean))
x <- 1:4
model <- lm(yv~x+I(xˆ2)+I(xˆ3))
summary(model)
x <- 1:4
x2 <- xˆ2
x3 <- xˆ3
cor(cbind(x,x2,x3))
t(contrasts(treatment))
y <- as.vector(tapply(response,treatment,mean))
model <- lm(y~poly(x,3))
summary(model)
xv <- seq(1,4,0.1)
yv <- predict(model,list(x=xv))
(bar.x <- barplot(y))
barplot(y,names=levels(treatment))
xs <- -0.5 + 1.2 * xv
lines(xs,yv,col="red")
9.28 Summary of statistical modelling
The steps in the statistical analysis of data are always the same, and should always be done in the following
order:
(1) data inspection (plots and tabular summaries, identifying errors and outliers);
(2) model specification (picking an appropriate model from many possibilities);
(3) ensure that there is no pseudoreplication, or specify appropriate random effects;
(4) fit a maximal model with an appropriate error structure;
(5) model simplification (by deletion from a complex initial model);
(6) model criticism (using diagnostic plots, influence tests, etc.);
(7) repeat steps 2 to 6 as often as necessary