1.ANCOVA
离散程度
斜方差
分类变量和
连续 - 连续+组合变量
分类变量是独立的

给定条件的分布是条件分布,不给定条件则认为是边界分布。我们研究的是条件分布。

a是根据样本的分类,计算出来的离开总均值的距离。

每个值都可以通过下面的进行分析

连续的和方差分析的要求都需要。

回归的齐性。有两个连续变量,都是正的

(1)回归的齐性
回归的齐性。

当存在不齐的时候,如何

对于x的假定

离群值有较大的影响。怎么算,可以算分布的情况,求p值,看它是不是的。


判断异常值的算法。

至少有一个连续变量和一个离散的变量,删除荣誉的

缺失值的情况


# Tensile strength in paper manufacturing
Y <- c(30,35,37,36,34,41,38,42,29,26,33,36,
28,32,40,41,31,36,42,40,31,30,32,40,
31,37,41,40,35,40,39,44,32,34,39,45)
block <- gl(3, 12, 36) # Three blocks
A <- gl(3, 4, 36) # Three pulp preparation methods
B <- gl(4, 1, 36) # Four different temperatures
Dat <- data.frame(Y, block, A, B)
Dat = Dat[-c(1:3), ] # make data unbalanced
summary(aov(Y ~ A*B, data = Dat)) # type I sum of square
Anova(mod <- lm(Y ~ A*B, data = Dat), type = "II") # library(car)
Anova(mod <- lm(Y ~ A*B, data = Dat), type = "III")


一般是三类的情况
data(mtcars)
mtcars$cyl = as.factor(mtcars$cyl)
mtcars$am = as.factor(mtcars$am)
table(mtcars$cyl)
table(mtcars$am)
# SS of mpg
SSE = sum(residuals(lm(mpg ~ 1, data=mtcars))^2) # 1126
# SSE of mpg after cyl being explained
SSE.cyl = sum(residuals(lm(mpg ~ cyl, data=mtcars))^2) # 301
# SSE of mpg after cyl and am being explained
SSE.cyl.am = sum(residuals(lm(mpg ~ cyl+am, data=mtcars))^2) # 264
# SSE of mpg after cyl, am, and cyl:am being explained
SSE.cyl.am.cyl_am= sum(residuals(lm(mpg ~ cyl*am, data=mtcars))^2) # 239
## Type I SS
SS.cyl = SSE - SSE.cyl # SS of cyl, 825
SS.am = SSE.cyl - SSE.cyl.am # SS of am, 37
SS.cyl_am = SSE.cyl.am - SSE.cyl.am.cyl_am # SS of cyl:am, 25
总平均只和组平均相关

用第三类的变化。

样本变量最好均衡

方差分析。

连续变量的差异


偏离斜方差的值



每个连续变量占据一个自由度,离散变量占据水平-1个自由度。
原假设

检查回归的齐性,显不显著(带分析)


当关系复杂的时候,可以将其分成几段。连续变量分成计算。
能做回归,不做方差分析。方差分析比较粗糙。
lines <-
"Foot Elev Landcover Nestsite
22 1230 11 金家村
32 605 16 七氏山后
33 471 16 纸坊街5组
25 1200 11 3组1
32 602 16 7组
28 698 16 2组滚沟
50 471 11 3组2
52 471 11 蔡河4组
38 490 16 3组龙泉
28 648 16 牛河
20 942 16 代家河
20 1250 11 草坝4组
34 477 16 4组云阳
20 1264 11 华阳中学1号
32 681 16 4组黄沟
30 629 16 曹沟
32 483 16 5组麻洞
43 643 11 3组分会田
32 643 11 3组堰岔弯
30 698 16 后沟
28 698 16 汤帽
25 1100 11 草坝5组
35 533 16 党河电站
38 1087 11 高峰5组
45 674 11 7组袁沟
32 624 16 3组
20 757 16 沙溪沟
32 548 16 夏组
45 653 11 1组石洽
27 665 16 2组狗家沟
32 624 16 戴家沟
20 739 16 池塘岸
32 1001 11 8组
42 805 11 2组"
ibis <- read.table(con
<- textConnection(lines),
header=TRUE)
close(con)
#ANCOVA
#Human footprint index, elevation, and landcover
nrow(ibis) #34 nests
ibis$Landcover[ibis$Landcover == 11] <- 1 #Forest
ibis$Landcover[ibis$Landcover == 16] <- 2 #Shrub
ibis$Landcover <- as.factor(ibis$Landcover)
Elev = ibis$Elev; Foot = ibis$Foot; Landcover = ibis$Landcover
plot(Elev, Foot, pch=16+as.numeric(Landcover), col=c('blue',
'red')[as.numeric(Landcover)],cex=1.5)
abline(lm(Foot[Landcover==1]~Elev[Landcover==1]),lty=1, col='blue')
abline(lm(Foot[Landcover==2]~Elev[Landcover==2]),lty=1, col='red')
t检验是两个水平
直接做分析没有显著性的差异
options(digits=3)
tapply(Foot, Landcover, mean)
t.test(Foot ~ Landcover)
anova1 <- lm(Foot~Landcover)
summary(anova1)
ancova <- lm(Foot~Landcover*Elev)
summary(ancova)
加个连续变量是比较回归线,直接比较是比较y值。
anova(ancova)
ancova2 = update(ancova, ~. -Landcover:Elev)
anova(ancova, ancova2)
anova可以展示一个模型所有的量,也可以比较两个模型的差别。(解释上面上说)
减去一个参数
ancova3 = update(ancova2, ~. -Elev)
anova(ancova2, ancova3)
一步一步来进行分析
step(ancova)
l越大,模型越好。k是模型的复杂程度。AIC越小越好。

ancova <- lm(Foot~Landcover+Elev)
summary(ancova)
第一个值截距,第二个两条线的距离,第三个值是斜率。
plot(Foot~Elev, pch=as.numeric(Landcover)+15, col=as.numeric(Landcover))
abline (66.20532, -0.03489, col = 1 ) #
abline (66.20532-14.45224, -0.03489, col=2) #

简单模型会更好
只要它不显著,先把最复杂出掉,再去掉第二复杂的内容。
lines <-
"obs class pre post IQ
1 1 3 10 122
2 2 24 34 129
3 3 10 21 114
4 1 5 10 121
5 2 18 27 114
6 3 3 18 114
7 1 6 14 101
8 2 11 20 116
9 3 10 20 110
10 1 11 29 131
11 2 10 13 126
12 3 3 9 94
13 1 11 17 129
14 2 11 19 110
15 3 6 13 102
16 1 13 21 115
17 2 2 28 138
18 3 9 24 128
19 1 7 5 122
20 2 10 13 119
21 3 13 19 111
22 1 12 17 112
23 2 14 21 123
24 3 7 25 119
25 1 13 17 123
26 2 11 14 115
27 3 10 24 120
28 1 8 22 119
29 2 12 17 116
30 3 9 21 112
31 1 9 22 122
32 2 14 16 125
33 3 7 21 105
34 1 10 18 111
35 2 7 10 122
36 3 4 17 120
37 1 6 11 117
38 2 8 18 120
39 3 7 24 120
40 1 13 20 112
41 2 10 13 111
42 3 12 25 118
43 1 7 8 122
44 2 11 17 127
45 3 6 23 110
46 1 11 20 124
47 2 12 13 122
48 3 7 22 127
49 1 5 15 118
50 2 6 13 127
51 1 9 25 113
52 2 3 13 115
53 1 8 25 126
54 2 4 13 112
55 1 2 14 132
56 1 11 17 93"
scores <- read.table(con <- textConnection(lines), header=TRUE); close(con)
ancova = lm(post.score ~ class.type * pre.score * IQ)
scores$class = factor(scores$class)
ancova = lm(post ~ class * pre * IQ,
data = scores)
summary(ancova)
数据不均衡,用第三类的值。

另一个例子
library(faraway)
data(sexab)
by(sexab, sexab$csa, summary)
plot(ptsd ~ csa, sexab)
plot(ptsd ~ cpa, pch = as.numeric(sexab$csa),
col = as.numeric(sexab$csa), sexab)
m1 <- lm (ptsd ~ cpa+csa+cpa:csa, sexab)
summary (m1)
model.matrix (m1)
m2 <- lm (ptsd ~ cpa+csa, sexab)
summary (m2)
plot(ptsd~cpa, pch=as.numeric(sexab$csa),
col=as.numeric(sexab$csa), sexab)
abline (3.9753, 0.5506, col = 'red' ) # not abused
abline (10.248, 0.5506) # abused, 10.248 = 3.9753 + 6.2728
plot (fitted (m2), residuals (m2), pch=as.numeric(sexab$csa),
xlab= "Fitted", ylab="Residuals")
# change the reference level
sexab$csa <- relevel (sexab$csa, ref="NotAbused") # ref="Abused"
m3 <- lm (ptsd ~ cpa+csa, sexab)
summary (m3)
library(faraway)
data (fruitfly)
plot (longevity ~ thorax, fruitfly, pch=unclass (activity))
legend (0.63, 100, levels (fruitfly$activity), pch=1:5)
g <- lm (longevity ~ thorax*activity, fruitfly)
summary (g)
model.matrix(g)
anova(g)
gb <- lm (longevity ~ thorax+activity, fruitfly)
drop1 (gb, test="F") # drop one term, using F test
summary (g)
将斜变量去掉
summary (g)
drop1 (gb, test="F")

(2)缺失值
如果缺失数据的情况下。用平均值取代是一个很好的方法。
library(faraway)
data(chmiss) # Chicago insurance dataset
head(chmiss)
model <- lm(involact ~ ., chmiss)
summary(model)
cmeans <- apply (chmiss, 2, mean, na.rm=T)
cmeans
mchm <- chmiss
for (i in c(1, 2, 3, 4, 6)) mchm[is.na (chmiss[,i]), i] <- cmeans[i]
model <- lm(involact ~ ., mchm)
summary(model)
1.Mixed effects Ancova

这个是斜率一致的情况
library(nlme)
RIKZ$fBeach <- factor(RIKZ$Beach)
Mlme1 <- lme(Richness ~ NAP, random = ~1 | fBeach, data = RIKZ)
summary(Mlme1)
这个是斜率不一致的情况
library(nlme)
RIKZ$fBeach <- factor(RIKZ$Beach)
Mlme2 <- lme(Richness ~ NAP, random = ~1 + NAP | fBeach, data = RIKZ)
summary(Mlme2)

这个数据是可以在bing上下载的
# The Random Intercept and/or slope Model
RIKZ = read.table('D:/softwares/R/library/AED/data/RIKZ.txt',header=T)
library(nlme)
RIKZ$fBeach <- factor(RIKZ$Beach)
Mlme1 <- lme(Richness ~ NAP, random = ~1 | fBeach, data = RIKZ)
Mlme1 <- lme(Richness ~ NAP, random = ~1 + NAP | fBeach, data = RIKZ)
summary(Mlme1)
# plot regression lines
F0 <- fitted(Mlme1, level = 0) # fitted values obtained by the population model
F1 <- fitted(Mlme1, level = 1) # fitted values obtained by within-beach model
I <- order(RIKZ$NAP); NAPs <- sort(RIKZ$NAP)
plot(NAPs, F0[I], lwd = 4, type = "l",
ylim = c(0, 22), ylab = "Richness", xlab = "NAP")
for (i in 1:9){
x1 <- RIKZ$NAP[RIKZ$Beach == i]
y1 <- F1[RIKZ$Beach == i]
K <- order(x1)
lines(sort(x1), y1[K])
}
text(RIKZ$NAP, RIKZ$Richness, RIKZ$Beach, cex = 0.9)
这个需要模拟最好的情况。
model = lm(Richness ~ NAP * fBeach, data = RIKZ)
anova(model)
pred = predict(model, RIKZ[, c('NAP',
'fBeach')])
plot(Dat$NAP, Dat$Richness, xlab='NAP', ylab='Richness', col='white')
Dat = cbind(RIKZ, pred)
for (i in 1:9) {
Data = Dat[Dat$Beach==i, ]
lines(Data$NAP, Data$pred, col=i)
points(Data$NAP, Data$Richness, col=i, pch=16)
}
混合模型,假设斜率和截距相关。混合像元模型很能表现出,
数据有自相关也可以解决这个问题。
head(ABirds) # data
AP <- c(ABirds$ArrivalAP, ABirds$LayingAP)
SOI2 <- c(ABirds$SOI, ABirds$SOI)
Y2 <- c(ABirds$Year, ABirds$Year)
ID <- factor(rep(c("Arrival", "Laying"), each = 55))
library(nlme)
vf2 <- varIdent(form =~ 1 | ID)
M1 <- gls(AP ~ SOI2 + ID + SOI2:ID, weights = vf2, na.action = na.omit)
M2 <- gls(AP ~ SOI2 + ID + SOI2:ID, weights = vf2, na.action = na.omit,
correlation = corAR1(form =~Y2 | ID))# 这块量化了自相关的情况。
anova(M1, M2)
M3 <- gls(AP ~ SOI2 + ID + SOI2:ID, weights = vf2, na.action = na.omit, method = "ML",
correlation = corAR1(form =~Y2 | ID))
M4 <- gls(AP ~ SOI2 + ID, weights = vf2, na.action = na.omit, method = "ML",
correlation = corAR1(form =~Y2 | ID))
M5 <- gls(AP ~ ID, weights = vf2, na.action = na.omit, method = "ML",
correlation = corAR1(form =~Y2 | ID))
anova(M3, M4, M5)
plot(ABirds$SOI, ABirds$ArrivalAP, ylim = c(195, 260), type = "n",
ylab = "Arrival & laying dates", xlab='SOI')
points(ABirds$SOI, ABirds$ArrivalAP, pch = 1)
points(ABirds$SOI, ABirds$LayingAP, pch = 2)
MyX <- data.frame(SOI2 = seq(from = min(ABirds$SOI),
to = max(ABirds$SOI),
length = 20), ID = "Arrival")
Pred1 <- predict(M3, newdata = MyX)
lines(MyX$SOI2, Pred1)
MyX <- data.frame(SOI2 = seq(from = min(ABirds$SOI),
to = max(ABirds$SOI),
length = 20), ID = "Laying")
Pred2 <- predict(M3, newdata = MyX)
lines(MyX$SOI2, Pred2)


glm广义线性模型,设计到多个参数
gam光滑,非参数
lme混合像元的包
nls 非线性的拟合


2262

被折叠的 条评论
为什么被折叠?



