# w2----------------
# Q2.1---------
rm(list = ls())
N <- 1000
x <- rexp(N,rate=0.5)
hist(x,freq = F)
xx <- seq(0,15,by=0.1)# kept forgetting how to add this grid
lines(xx,dexp(xx,rate = 0.5),col='red')# this one is the theoretical density
lines(density(x),col='blue')# kept forgetting this too; this one is the estimate (KDE) -- keep the two curves separate
?density()
lines(density(x,kernel = "rectangular"),col='orange')
# how to plot the KDE directly
plot(density(x), lwd=3, col=2)
rug(x)
?rug# adds a rug representation of the data (a 1-d plot) to the figure
density(x)$bw
bw0 <- density(x)$bw
ud <- density(x,bw=bw0/10)# under-smoothing: bandwidth much smaller than the default
ov <- density(x,bw=bw0*10)# over-smoothing: bandwidth much larger than the default
plot(ud)
lines(ov)
# Q2.2---------
eps <- 0.95
N <- 100
u <- seq(0,1,length=N)
x <- numeric(N) # we show the generated sample, so call it x rather than y
for(i in 1:N){# this is simply how the mixture is defined
if(u[i]>eps){
x[i] <- rt(1,df=3)
}else{
x[i] <- rnorm(1)
}
}
hist(x)
# Q2.3---------
n <- 1000
set.seed(4060)
# (a)
x1 <- rnorm(n,mean=2,sd=1)
f1 <- density(x1)
hist(x1,freq = F)
xx1 <- seq(-1,7,by=0.1)
lines(xx1,dnorm(xx1,mean = 2,sd=1),col='red')
lines(f1,col='blue')
f1 <- density(x1,adjust =1.3)
lines(f1,col='orange')
# (b)
x2 <- rt(n,df=3)
f2 <- density(x2)$y
hist(x2,freq = F,ylim = c(0,0.6))# remember to control the y-axis range here!!!
xx2 <- seq(-15,15,by=0.1)
lines(xx2,dt(xx2,df=3),col='red')
lines(density(x2),col='blue')
lines(density(x2,adjust = 1.5),col='orange')
# MSE between the KDE and the true t density, evaluated on the KDE's x-grid
# (these two lines sat before (b) in the notes, but they need x2 and f2 to exist):
mse <- mean((dt(density(x2)$x,df=3)-f2)^2)
hist(x2,freq = F,ylim = c(0,0.6),breaks = 21)
xx2 <- seq(-15,15,by=0.1)
lines(xx2,dt(xx2,df=3),col='red')
lines(density(x2),col='blue')
# (c)
x3 <- rexp(n,rate = 2)
f3 <- density(x3)$y
hist(x3,freq = F,ylim = c(0,1.5))
lines(density(x3),col='blue')
xx3 <- seq(0,3.5,length=1000)
lines(xx3,dexp(xx3,rate=2),col='red')# the theoretical curve must use the same rate as the sample (rate=2)
# Q2.4---------
library(KernSmooth)
dat <- faithful
# (a)
bw = apply(faithful, 2, sd)/2
fx <- bkde2D(dat,bandwidth = bw)
plot(dat,xlim=c(1,6),ylim=c(35,105))
contour(fx$x1, fx$x2, fx$fhat,add = T)# kept forgetting this
persp(fx$x1, fx$x2, fx$fhat, shade=.15, theta=-30, phi=15)
# Summary
# smaller h / bw / adjust: the KDE is squigglier, with taller & narrower peaks
# larger h / bw / adjust: the KDE is smoother and flatter
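# Added sketch (not from the original notes): a quick look at the 'adjust' effect on a 1-D KDE,
# using the eruptions column of the faithful data loaded above.
plot(density(faithful$eruptions), lwd=2, main="effect of 'adjust' on the KDE")
lines(density(faithful$eruptions, adjust=0.3), col='red')  # small bandwidth: squiggly, tall narrow peaks
lines(density(faithful$eruptions, adjust=3), col='blue')   # large bandwidth: smooth, flat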
# (b)
lp.fit = locpoly(faithful$eruptions, faithful$waiting, bandwidth=1)# kept forgetting this
# if only a local polynomial fit is asked for, this is all that's needed; a smaller bandwidth gives a wigglier fit; bandwidth=1 is not a default, pick a value yourself
plot(faithful, pch=20)
lines(lp.fit, lwd=3, col=1)
?locpoly
# Q2.5---------
# had no idea how to do this one
# (a)
n = 1000
set.seed(4060)
Mu = c(1,4)# pick your own values
Sigma = c(2,1)# pick your own values
X1 = rnorm(n, mean=Mu[1], sd=Sigma[1])
X2 = rnorm(n, mean=Mu[2], sd=Sigma[2])
X = cbind(X1,X2)
# (b)
library('MASS')
bw = c(.5, .5) * 4
f.kde2d = kde2d(X1, X2, h=bw) # the first two arguments are simply the two coordinates of the sample, not a predictor x and response y
names(f.kde2d)
plot(X,xlim=c(-6,12),ylim=c(0,8))
contour(f.kde2d$x,f.kde2d$y,f.kde2d$z,add = T)# contour() needs all three components here; passing f.kde2d directly also works
# (c) not examined
# w3----------------
rm(list = ls())
# Q5.1 MC integration-----
# integral of e^(-x) over (2,4) (about 0.117) -- kept forgetting this
# g(x)=e^(-x); x~f(x)=1/2 = Unif(2,4); the integral equals (b-a)*E[g(X)] = 2*E[g(X)]
# the choice x ~ f(x)=1/2 = Unif(2,4) is the crucial part!!!!
M <- 1000
a <- 2
b <- 4
x <- runif(M,a,b)
theta.est <- (sum(exp(-x))/M)*2;theta.est
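# Added sanity check (not in the original notes): exact value of the integral for comparison.
exp(-2) - exp(-4) # ~ 0.117, which theta.est should be close to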
# Q5.2-------
N <- 1000
x <- 1
a <- x
b <- -x
t <- runif(N,min=b,max = a)
prob <- ((sum((1/sqrt(2*pi))*exp(-0.5*(t^2))))/N)*2
rs <- 0.5+prob/2
# check:
pnorm(1)
# to evaluate the variance of the estimator, wrap another MC loop around it
M <- 999
v <- numeric(M)
for(i in 1:M){
t <- runif(N,min=b,max = a)
prob <- ((sum((1/sqrt(2*pi))*exp(-0.5*(t^2))))/N)*2
rs <- 0.5+prob/2
v[i] <- rs
}
mean(v)#check
var(v)
quantile(v,c(0.025,0.975))
# Q5.3-------
# (a)
n <- 1000
x <- rnorm(n)
is <- order(x)
xk <- x[is]
k <- 0
xnew <- xk[c((k+1):(n-k))]
length(xnew)
mn <- numeric(n/2)
for(k in 0:(n/2-1)){
xnew <- xk[c((k+1):(n-k))]
mn[k+1] <- mean(xnew)
}
mn
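# Added visualisation (optional, not in the original notes): how the trimmed mean changes as k increases.
plot(0:(n/2-1), mn, t='l', xlab='k', ylab='trimmed mean')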
# (b)
k <- 1 # here k is fixed
M = 1000
n = 20 #sample size
tm <- numeric(M)
mn <- numeric(M)
for (m in 1:M) {
x <- rnorm(n,mean=15,sd=3)
mn[m] <- mean(x)
xs <- sort(x)
xnew <- xs[c((k+1):(n-k))]
tm[m] <- mean(xnew)
}
mse1 <- mean((mn-15)^2)
mse2 <- mean((tm-15)^2)
b1 <- mean(mn-15)
b2 <- mean(tm-15)
mse1
var(mn)+b1^2 # MSE = variance + bias^2
mse2
var(tm)+b2^2
# Q5.4-------
# (a)
N <- 50
M <- 100
a <- 7
b <- 3
x <- rep(c(1:5), N/5)
m <- 0.5
s <- 1.2
rseed=0
LSvec = RMvec = matrix(0,2,M)
set.seed(rseed)
library(MASS)
library(VGAM)
# (b)
for(i in 1:M){
e <- rlnorm(N,meanlog=m,sdlog=s) # use the m and s set in (a); the loop index is renamed so it does not shadow m
y <- a+b*x+e
mylm <- lm(y~x)
myrlm <- rlm(y~x,method="MM")
# store the coefficient estimates of each replicate in the matrices declared in (a):
LSvec[,i] <- coef(mylm)
RMvec[,i] <- coef(myrlm)
}
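# Added follow-up (an assumption about where the exercise goes next): compare the two
# estimators of the slope over the M replicates; the true slope is b = 3.
apply(LSvec, 1, mean) # MC means of the LS estimates (intercept, slope)
apply(RMvec, 1, mean) # MC means of the robust MM estimates
boxplot(LSvec[2,], RMvec[2,], names=c("LS slope","MM slope"))
abline(h=b, col='red')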
# Q5.5-------
M <- 10000
N <- 50
x <- rnorm(N,mean = 0,sd=2)
s1 <- s2 <- numeric(M)
?var()
for(m in 1:M){
x <- rnorm(N,mean = 0,sd=2)
s1[m] <- sd(x)
s2[m] <- sqrt(mean((x-mean(x))^2))# this one is the biased estimator, remember
}
mean(s1)
mean(s2)
boxplot(s1,s2)
M <- 10000
N <- 100
x <- rnorm(N,mean = 0,sd=2)
s1 <- s2 <- numeric(M)
?var()
for(m in 1:M){
x <- rnorm(N,mean = 0,sd=2)
s1[m] <- sd(x)
s2[m] <- sqrt((N-1)/N)*s1[m]# the biased (divide-by-N) estimator of sd, matching the previous block
}
sd(s1)
sd(s2)
boxplot(s1,s2)
# w4----------------
rm(list = ls())
iris <- iris
x <- subset(iris$Sepal.Length,iris$Species=='setosa')
xbar <- mean(x,na.rm = T)
n <- length(x)
B <- 100
xb <- numeric(B)
for(b in 1:B){
#idx <- sample(1:n,n,replace = T)
tst <- sample(x,n,replace = T)# sampling values directly is equivalent to sampling indices
#xnew <- x[idx]
xb[b] <- mean(tst)
}
mean(xb)
mean(xb)-xbar#bias
#estimate of SE:
sd(xb)
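# Added check (not in the original notes): compare with the usual analytic SE of the mean.
sd(x)/sqrt(n)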
# Q5.9-------
cars <- cars
x <- cars$speed
y <- cars$dist
B <- 10000
n <- nrow(cars)
lm1 <- lm(y~x)
summary(lm1)
lm1$coefficients[2]
slp <- itc <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
ynew <- y[idx]
xnew <- x[idx]
lmb <- lm(ynew~xnew)
slp[b] <- lmb$coefficients[2]
itc[b] <- lmb$coefficients[1]
}
# Bs SE estimate:
sd(slp)
sd(itc)
#Bs estimate of bias:
mean(slp)-coef(lm1)[2]
# naive bootstrap 95% CI
quantile(slp,c(0.025,0.975))
# MC matrix-------
# fill every entry of the matrix with an MC mean estimate
M <- 100
mu <- 10
sigma <- 3
Ns <- c(20,30,50,100,200,500)
mt <- matrix(NA,nrow = M,ncol = length(Ns))
for (j in 1:length(Ns)) {
for (m in 1:M) {
x <- rnorm(Ns[j],mean = mu,sd=sigma)
mt[m,j] <- mean(x)
}
}
apply(mt,2,mean)
boxplot(mt,names=Ns,xlab='sample size N')
# theoretical CI:
lb <- mu-1.96*sigma/sqrt(Ns)#lower bound
ub <- mu+1.96*sigma/sqrt(Ns)#upper bound
# empirical/practical CI:
lb <- apply(mt,2,mean)-1.96*sigma/sqrt(Ns)#lower bound
ub <- apply(mt,2,mean)+1.96*sigma/sqrt(Ns)#upper bound
# Q5.6-------
M <- 100
ndf <- c(2,4,10)
n <- 100
mt <- matrix(NA,nrow = M,ncol = 3)
for(j in 1:3){
for (i in 1:M) {
x <- rchisq(n,df=ndf[j])
mt[i,j] <- mean(x)
}
}
boxplot(mt[,1],mt[,2],mt[,3])
# what we can see is that E(X) = df for a chi-square -- kept forgetting this
sd <- apply(mt,2,sd)#the estimates of standard error of xbar
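# Added check (not in the original notes): theoretical SE of the sample mean of a chi-square,
# using Var(chi-square with df degrees of freedom) = 2*df.
sqrt(2*ndf/n)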
# w5----------------
# 20-21 CA1 Q1------
set.seed(6040)
M <- 1000
N <- 1000
r <- numeric(M)
for(m in 1:M){
x <- sample(1:N,N,replace = T)
xnew <- unique(x)
r[m] <- length(xnew)/N
}
mean(r)
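# Added check (not in the original notes): the theoretical expected proportion of distinct values,
# 1-(1-1/N)^N, tends to 1 - exp(-1) ~ 0.632 for large N.
1 - exp(-1)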
# 20-21 CA1 Q2------
set.seed(6015)
M <- 1000
N <- 100
a <- 3
b <- 2
mn <- numeric(M)
for(m in 1:M){
x <- rgamma(N,shape = a,rate = b)
mn[m] <- mean(x)
}
# (a)
mean(mn)
# (b)
# not surprising: the true mean is a/b = 1.5, and the larger M and N are, the closer this gets to 1.5
# (c)
sd(mn)
# 20-21 CA1 Q3------
# read the question carefully about which variables are involved (Height, in feet, and Girth)
set.seed(4060)
dat <- trees
B <- 100
slp <- numeric(B)
n <- nrow(dat)
for(b in 1:B){
idx <- sample(c(1:n),n,replace = T)
ynew <- dat$Height[idx]
xnew1 <- dat$Girth[idx]
xnew2 <- dat$Volume[idx] # xnew2 was missing; the model below uses both Girth and Volume
lmb <- lm(ynew~xnew1+xnew2)
slp[b] <- lmb$coefficient[2]
}
lmb <- lm(dat$Height~dat$Girth+dat$Volume)
lmb$coefficients[2]
# (a)
boxplot(slp)
# (b)
mean(slp)
# (c)
sd(slp)
# (d)
# bootstrap empirical (basic/reflected) CI
lso <- lm(Height~Girth+Volume,data=dat)$coef # the slope estimate from the original sample
2*lso[2] - quantile(slp, probs=c(0.975,0.025)) # couldn't remember how to write this one
# w6----------------
mtcars <- mtcars
n <- nrow(mtcars)
dat <- mtcars[sample(1:n,n),]
y = dat$mpg
x <- dat[,-1]
glm1 <- glm(y~.,data = x)
yhat <- glm1$fitted.values# extract the fitted values
plot(y,yhat)
boxplot((y-yhat)^2)
mean((y-yhat)^2)
train.idx <- c(1:floor(n/2))
# equivalently: train.idx <- c(1:16) for mtcars (n = 32)
glm.train <- glm(y[train.idx]~.,data = x[train.idx,])
o1 <- glm(y~.,data = x,subset = train.idx)# same thing as glm.train
ytrain <- glm.train$fitted.values
ytst <- predict(glm.train,newdata = x[-train.idx,])
mse.train <- mean((glm.train$residuals)^2)
mse.tst <- mean((y[-train.idx]-ytst)^2)
boxplot((glm.train$residuals)^2,(y-ytst)^2)
boxplot((ytrain-y[train.idx])^2)
# now add the bootstrap:
B <- 100
mseb.train <- mseb.tst <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
glm.t <- glm(y~.,data=x,subset = idx)
mseb.train[b] <- mean((glm.t$residuals)^2)
ytst <- predict(glm.t,newdata=x[-idx,])
mseb.tst[b] <- mean((y[-idx]-ytst)^2)
}
boxplot(mseb.train,mseb.tst)
# k-fold CV
K <- 5
msecv.train <- msecv.tst <- numeric(K)
folds <- cut(1:n,K,labels = F)
table(folds)
for(k in 1:K){
idx <- which(folds!=k)
glm.t <- glm(y~.,data = x,subset = idx)
msecv.train[k] <- mean((glm.t$residuals)^2)
ytst <- predict(glm.t,newdata = x[-idx,])
msecv.tst[k] <- mean((y[-idx]-ytst)^2)
}
boxplot(msecv.train,msecv.tst)
B <- 10
mseb.train <- mseb.tst <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
glm.t <- glm(y~.,data=x,subset = idx)
mseb.train[b] <- mean((glm.t$residuals)^2)
ytst <- predict(glm.t,newdata=x[-idx,])
mseb.tst[b] <- mean((y[-idx]-ytst)^2)
}
boxplot(mseb.train,mseb.tst)
# With the bootstrap, about 0.632*32 ~ 20 distinct observations end up in training, so roughly 12 are left out as 'test'.
# With K=3 CV, about 32/3 ~ 10 observations are used as test each time.
# So we would expect the two pairs of errors to look similar for K=3; here K=5, and the
# K-fold CV errors are more accurate and less widespread. Increasing B tightens the bootstrap spread but produces many more outliers.
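# Added sanity check of the 0.632 figure quoted above (n = nrow(mtcars) = 32 here):
mean(replicate(1000, length(unique(sample(1:n, n, replace=TRUE))))) # ~ 0.632*n ~ 20 distinct in 'train'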
# Q5.8--------------
rm(list = ls())
library(bootstrap)
dat <- law
cor(dat$LSAT,dat$GPA)
B <- 1000
n <- nrow(dat)
cr <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
x1 <- dat$LSAT[idx]
x2 <- dat$GPA[idx]
cr[b] <- cor(x1,x2)
}
mean(cr)
# bias
mean(cr)-cor(dat$LSAT,dat$GPA)
# naive (percentile) CI
quantile(cr,c(0.025,0.975))
# Q5.9--------------
rm(list = ls())
dat <- cars
# (a)
lm1 <- lm(dat$dist~dat$speed)
lm1$coefficients
# (b)
B <- 10000
n <- nrow(dat)
slp <- numeric(B)
itc <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
xb <- dat$speed[idx]
yb <- dat$dist[idx]
lmb <- lm(yb~xb) # avoid naming an object 'lm' (it masks the lm() function)
slp[b] <- lmb$coefficient[2]
itc[b] <- lmb$coefficient[1]
}
lm1$coefficient
mean(slp)
mean(itc)
# (c)
# didn't know how to do this one
# the question just asks us to look at the distributions of the two estimators; a histogram is enough for that
hist(slp)
hist(itc)
# (d)
# didn't know how to do this one either
# remember how to look at the joint distribution------
library(ellipse)
plot(itc,slp,cex=0.5,pch=20)
e <- ellipse(lm1)
plot(e,t='l')
bcoef <- cbind(itc,slp)
points(bcoef,pch=20,col=8,cex=.5)
points(itc,slp,pch=20,col=1,cex=.5)# same thing
sd(slp)
sd(itc)
# bootstrapping directly with the boot package--------
library(boot)
# define your own function; its 1st argument must be the data and its 2nd the bootstrap indices:
my.mean <- function(x, ib){
return( mean(x[ib]) )
}
B = 100
x = cars$dist
ob = boot(x, my.mean, R=B)
# the 'original' column of ob is the statistic on the full sample, i.e. mean(x)
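# Added note (assuming 'ob' as created above): other quantities that can be read off the boot object.
ob$t0              # the statistic on the original sample (the 'original' column)
sd(ob$t)           # bootstrap estimate of the standard error
mean(ob$t) - ob$t0 # bootstrap estimate of the bias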
# Q5.10 how to use nls()----------
rm(list = ls())
x = mtcars$disp
y = mtcars$mpg
plot(x,y,pch=20)# there is an inflexion, so a nonlinear model may work better
# nonlinear regression, e.g. polynomial or exponential
theta <- c(3,-0.01)
nls1 <- nls(y~exp(a+b*x),start = list(a=3,b=-0.01))
fitted(nls1)
plot(fitted(nls1),y)
plot(fitted(nls1)-y)# these are the residuals; there should be no pattern
hist(fitted(nls1)-y)
# (a) how to extract the nls parameters--------
summary(nls1)
summary(nls1)$parameter[1,1]
summary(nls1)$parameter[2,1]
plot(x,y,pch=20)
is = order(x)# returns the positions that sort x in increasing order
#20 19 18 26 28 3 ......
x[20]
min(x)
lines(x[is], fitted(nls1)[is], pch=20, cex=2, col=4, lwd=2)
?order
# (b)
z <- density(x)
plot(z)
# (c)
set.seed(1)
B = 100
n <- length(x)
theta1 <- numeric(B)
theta2 <- numeric(B)
for(b in 1:B){
idx <- sample(1:n,n,replace = T)
xnew <- x[idx]
ynew <- y[idx]
bnl <- nls(ynew~exp(a+b*xnew),start = list(a=3,b=-0.01))
theta1[b] <- summary(bnl)$parameter[1,1]
theta2[b] <- summary(bnl)$parameter[2,1]
}
mean(theta1)
mean(theta2)
sd(theta1)
sd(theta2)
#se:
sd(theta1)
# nonparametric 95% confidence intervals = the naive (percentile) bootstrap intervals
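# Added example (one way to write them): naive percentile intervals from the bootstrap replicates.
quantile(theta1, c(0.025, 0.975))
quantile(theta2, c(0.025, 0.975))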
# Q2.11----------
# 'coerce' = to force/convert
rm(list = ls())
library(MASS)
x = Boston[,c('crim','indus','rm','tax')]
y = Boston$medv
lm1 <- lm(y~.,data = x)
# (a)
n <- nrow(x)
a <- c(1:floor(n/2))
lma <- lm(y~.,data = x,subset = a)
preds <- predict(lma,newdata = x[-a,])
length(preds)
rmsea <- sqrt(mean((y[-a]-preds)^2))
# (b)--------leave-one-out CV
preds <- resi <- numeric(n)
for(i in 1:n){
tot <- c(1:n)
b <- tot[-i]
lmb <- lm(y~.,data = x,subset = b)
preds[i] <- predict(lmb,newdata = x[i,])
resi[i] <- y[i]-preds[i]
rmseb <- sqrt(mean((resi)^2))
}
# (c)
K <- 5
folds <- cut(1:n,K,labels = F)
rmse <- numeric(K)
for(k in 1:K){
train.idx <- which(folds!=k)
tst.idx <- which(folds==k)
lmc <- lm(y~.,data = x,subset = train.idx)
predsc <- predict(lmc,newdata = x[tst.idx,])
rmse[k] <- sqrt(mean((y[tst.idx]-predsc)^2))
}
# but we forgot to shuffle the data first, so:
dat <- Boston[sample(1:n,n),]
x = dat[,c('crim','indus','rm','tax')]
y = dat$medv
K <- 5
folds <- cut(1:n,K,labels = F)
rmse <- numeric(K)
for(k in 1:K){
train.idx <- which(folds!=k)
tst.idx <- which(folds==k)
lmc <- lm(y~.,data = x,subset = train.idx)
predsc <- predict(lmc,newdata = x[tst.idx,])
rmse[k] <- sqrt(mean((y[tst.idx]-predsc)^2))
}
rmse
# CV repeated R times---------
# reshuffle the data at the start of each of the R repeats
R <- 10
K <- 5
folds <- cut(1:n,K,labels = F)
rmse <- numeric(K*R)
for (r in 1:R) {
is <- sample(1:n,n)
x <- x[is,]
y <- y[is]
for(k in 1:K){
train.idx <- which(folds!=k)
tst.idx <- which(folds==k)
lmc <- lm(y~.,data = x,subset = train.idx)
predsc <- predict(lmc,newdata = x[tst.idx,])
rmse[(r-1)*K + k] <- sqrt(mean((y[tst.idx]-predsc)^2))# how to fill the pre-allocated results vector inside a nested for loop
}
}
rmse
# w7----------------
rm(list = ls())
# worked example from lectures (section 3, p.8)--------
attach(pressure)
x <- temperature
y <- pressure
plot(x,y,pch=20,cex=0.5)# clearly not a linear pattern
lm1 <- lm(y~x)
lines(x,fitted(lm1))
lmf <- lm(y~x+I(x^2)+I(x^3)+I(x^4)+I(x^5))
plot(x,y,pch=20,cex=0.5)
lines(x,fitted(lmf))
nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))
summary(nls1)
lines(x,fitted(nls1),col='red')
cc <- coef(nls1)
coefficients(nls1)
curve(exp(cc[1]+cc[2]*x),col='blue',add=T)# same as using lines()
nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))
summary(nls1)
nlstst <- nls(y~exp(a+b*x),start = list(a=0,b=0.02),trace = T)
summary(nlstst)
crit <- function(theta,x,y){
return( sum( (y-x*theta)^2 ) )
}
thbar = 1.8# the true value of theta
#x = rep(c(1,3,7,8), len=100)
#y = x*thbar + rnorm(x)
optim.out = optim(par=c(1), fn=crit, x=x, y=y, method='L-BFGS-B',lower=c(-1), upper=c(0))
optim.out # $par gives the estimated value of theta
curve(exp(1e-07*x),col='blue',add=T)
# grid search----------
# nls(y~exp(a+b*x),start = list(a=0,b=0.02))
# now suppose we know nothing at all about the values of a and b
x <- temperature
y <- pressure
L <- 10
a.grid <- seq(-1,1,length=L)
b.grid <- seq(0.01,0.5,length=L)
crit <- matrix(NA,ncol=L,nrow=L)
for(i in 1:L){
for(j in 1:L){
crit[i,j] <- sum( (y-exp(a.grid[i]+b.grid[j]*x))^2 )
}
}
persp(a.grid,b.grid,crit,theta=45,col = 'lightblue')
b.grid <- seq(0.01,0.02,length=L)
crit <- matrix(NA,ncol=L,nrow=L)
for(i in 1:L){
for(j in 1:L){
crit[i,j] <- sum( (y-exp(a.grid[i]+b.grid[j]*x))^2 )
}
}
persp(a.grid,b.grid,crit,theta=45,col = 'lightblue')
contour(a.grid,b.grid,crit)
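# Added note (not in the original notes): the grid-search minimiser can be read off the criterion
# matrix with which.min() + arrayInd(), the same idea used again in the w8 section below.
ij <- arrayInd(which.min(crit), .dim=dim(crit))
c(a.grid[ij[1]], b.grid[ij[2]])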
# what a good grid-search surface looks like---------
# BONUS: a nicer example of a criterion surface plot...
rm(list=ls())
set.seed(4060)
n = 100
x = runif(n,-1,1)
y = .2*x+.3*x^2
plot(x,y)
L = 100
ag = seq(-1,1,length=L)
bg = seq(-1,1,length=L)
crit = matrix(NA,nrow=L,ncol=L)
for(i in 1:L){
for(j in 1:L){
crit[i,j] = sum( (y-ag[i]*x-bg[j]*x^2)^2 )
}
}
persp(ag,bg,crit,col='lightblue',theta=45)
contour(ag,bg,crit)
# how to fit any model using optim()----------
remove(list = ls())
x <- temperature
y <- pressure
crit <- function(th,x,y){
a <- th[1]
b <- th[2]
return(sum((y-exp(a+b*x))^2)) # note the parentheses: sum of squared residuals
}
range(x)
range(y)
?optim
opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y)
# the par argument: initial values for the parameters to be optimized over.
opt$par
opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,
method = 'L-BFGS-B', lower=c(0.01,0.02), upper=c(0.1,0.03))
opt$par
opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,
method = 'L-BFGS-B', lower=c(0,0.02), upper=c(0.1,0.03))
opt$par
opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,
method = 'L-BFGS-B', lower=c(-1,0.02), upper=c(0.1,0.03))
opt$par
opt <- optim(par = c(0,0.02),fn=crit,x=x,y=y,
method = 'L-BFGS-B', lower=c(-0.6,0.02), upper=c(0.1,0.03))
opt$par
nls1 <- nls(y~exp(a+b*x),start = list(a=0,b=0.02))
summary(nls1)
yhat <- exp(opt$par[1]+opt$par[2]*x)
plot(x,y)
lines(x,fitted(nls1),col='red')
lines(x,yhat,col='blue')
opt1 <- optim(par = c(0,0.02),fn=crit,x=x,y=y)
yhat1 <- exp(opt1$par[1]+opt1$par[2]*x)
lines(x,yhat1,col='orange')# this one is close to the nls fit
# so, should we add a third parameter c?
crit2 <- function(th,x,y){
# Must have theta as 1st parameter and return a single value...
# Here we implement a simple linear regression model
# for the sake of the example:
a = th[1]
b = th[2]
c = th[3] ## additional scaling parameter
return( sum( (y-c*exp(a+b*x))^2 ) )
}
opt2 <- optim(par = c(0,0.02,1),fn=crit2,x=x,y=y)
opt2$par
yhat2 <- opt2$par[3]*exp(opt2$par[1] + (opt2$par[2])*x)# c is a scale parameter
# The result is not great: it is essentially identical to the fit without c, so there is no need to add this parameter
plot(x, y, pch=20,
xlab="Temperature (Celsius)",
ylab="Vapor pressure (ml of mercury)",
main="Example: polynomial regression")
lines(x, yhat2, col='navy')
lines(x, fitted(nls1), col='blue')
lines(x, yhat, col=2, lwd=3)
# w8----------------
#1D overfitting
rm(list = ls())
dat <- na.omit(airquality)
x <- dat$Temp
y <- dat$Ozone
plot(x,y)
dat <- data.frame(x,y)# makes subset= and predict() easier to write; use data.frame(), not as.data.frame()!!!!
x <- data.frame(x)# this on its own does not work!!!!
n <- nrow(x)
train.idx <- which(y<100)# which() also returns positions
#tst.idx <- which(y>=100)
lmt <- lm(y~.,data = dat,subset = train.idx)
preds <- predict(lmt,newdata = dat[-train.idx,])
sqrt(mean( (preds-y[-train.idx])^2 ))
# multivariate overfitting
rm(list = ls())
dat <- mtcars
n <- nrow(dat)
R <- 20
K <- 2
fit <- numeric(K*R)
mse <- numeric(K*R)
fit.l <- numeric(K*R)
mse.l <- numeric(K*R)
folds <- cut(1:n,K,labels = F)
set.seed(4060)
library('glmnet')
for( r in 1:R){
dat <- dat[sample(1:n,n),]
x <- dat[,-1]
y <- dat$mpg
xm <- as.matrix(x)# use the matrix form throughout, in case x contains categorical variables
# glm() can recognise categorical variables by itself
for(k in 1:K){
train.idx <- which(folds!=k)
glm1 <- glm(y~.,data = x,subset = train.idx)
yh <- fitted(glm1)
fit[k+(r-1)*K] <- mean((yh-y[train.idx])^2)# indexed assignment, matching the pre-allocated vectors
preds <- predict(glm1,newdata=x[-train.idx,])
mse[k+(r-1)*K] <- mean((preds-y[-train.idx])^2)
cv.lb = cv.glmnet(xm[train.idx,],y[train.idx],grouped=FALSE)
# 10-fold CV by default
# the line above is how to choose the lasso lambda (a tuning parameter: it only scales the penalty, it is not part of the model)
lasso = glmnet(xm[train.idx,],y[train.idx],lambda=cv.lb$lambda.min)
yh.l = predict(lasso,newx=xm[train.idx,])# how to get the fitted values (yhat)
fit.l[k+(r-1)*K] = mean((yh.l-y[train.idx])^2)
yp.l = predict(lasso,newx=xm[-train.idx,])# predictions on the test set
mse.l[k+(r-1)*K] = mean((yp.l-y[-train.idx])^2)# test MSE
}
}
boxplot(fit,mse,fit.l,mse.l,
names=c("glm fit","glm error","lasso fit","lasso error"),
ylim=c(0,300))
boxplot(fit,mse,fit.l,mse.l,
names=c("glm fit","glm error","lasso fit","lasso error"),
ylim=c(0,25))
# now the data contains categorical variables
rm(list = ls( ))
library(ISLR)
set.seed(6026)
dat = na.omit(Hitters)
n = nrow(dat)
dat$Salary = log(dat$Salary)# the raw data is too skewed; taking logs makes it more normal
REPS = 50
train.error = test.error = numeric(REPS)
lasso.train.error = lasso.test.error = numeric(REPS)
for(j in 1:REPS){
dat = dat[sample(1:n, n, replace=FALSE),]
i.train = c(1:200)# use the first 200 observations for training, the remaining 63 as test
train = dat[i.train,]
test = dat[-i.train,]
fit = glm(Salary~., data=train)
pred = predict(fit, newdata=test)
train.error[j] = mean(fit$residuals^2)
test.error[j] = mean((pred-test$Salary)^2)
xm = model.matrix(Salary~., data=train)[,-1]
newxm = model.matrix(Salary~., data=test)[,-1]
y = train$Salary
lam = cv.glmnet(xm, y) # xm and y are already built from the training data, so no data= argument is needed
lasso = glmnet(xm, y, lambda=lam$lambda.min)
lasso.pred = predict(lasso, newx=newxm)
lasso.fit = predict(lasso, newx=xm)
lasso.train.error[j] = mean((lasso.fit-train$Salary)^2)
lasso.test.error[j] = mean((lasso.pred-test$Salary)^2)
}
summary(fit)
coef(lasso) # how to see which coefficients the lasso has muted down (shrunk to zero)-----------
par(font=2, font.axis=2)
boxplot(train.error, test.error,
main="MLB player salary prediction",
names=c("Training error", "Test error"),
col=c('pink','cyan'))
# another example: the Prestige data
rm(list = ls( ))
library(car) # contains dataset Prestige
library('glmnet')
pairs(Prestige)
y = Prestige$income
x = Prestige[,c("education","prestige","women")]
class(x) # note x is of class "data.frame" as opposed to "matrix"
lmo = lm(y~., data=x) # this works though (check out ?lm)
summary(lmo)
# how to predict from this fit:
newx = data.frame(education=seq(11,16,length=5),prestige=seq(14,80,length=5),women=seq(10,90,length=5))
lmo.pred = predict(lmo, newdata=newx)
# predict for a single new observation:
newx = data.frame(education=c(13),prestige=c(61),women=c(12))
lmo.pred = predict(lmo, newdata=newx)
xm = model.matrix(y~., data=x)[,-1]
lasso.cv = cv.glmnet(xm, y, alpha=1)#Cross-validation for glmnet
lasso = glmnet(xm, y, alpha=1, lambda=lasso.cv$lambda.min)
# let's compare LASSO with our earlier linear regression output:
cbind(coef(lasso), coef(lmo))
# how to predict from this fit:
# ?predict.glmnet
class(newx)
lasso.pred = predict(lasso, newx=as.matrix(newx))
# NB:
cbind(lasso.pred, lmo.pred) # close enough to the lm() predictions
mean((lasso.pred-lmo.pred)^2)
# percentage error-----------
mean((lasso.pred-lmo.pred)/lmo.pred)*100 # less than 1% error...
# only about -0.12%, so the original linear model is good enough
# another example: the Blood Pressure data
library(glmnet)
setwd('/Users/apple/Desktop/22-r/ST4060/ST4060txt-csv')
dat = read.table(file="Blood Pressure.txt",header=TRUE)
str(dat)
dat$PatientID = NULL # getting rid of this variable
# GLM fit
glm.fit = glm(Systolic~., data=dat)
mean(glm.fit$residuals^2)
# LASSO - note: fitting these models is rather cody...
alpha = 1
# prepare the data:
xm = model.matrix(Systolic~., data=dat)[,-1]
y = dat$Systolic
# compute best lambda:
lam = cv.glmnet(xm, y, alpha=alpha)
# c) fit model using best lambda:
lasso = glmnet(xm, y, lambda=lam$lambda.min)
# d) recover fitted values:
lasso.fit = predict(lasso, newx=xm)
# e) calculate RSS or MSE from LASSO
mean((lasso.fit-y)^2)
rm(list = ls( ))
dat = na.omit(airquality)
x = dat$Temp
y = dat$Ozone
plot(x,y)
nl.crit <- function(x, y, a, b){
return( mean( (y - a*exp(b*x))^2 ) )
}
# GRID SEARCH:----------
Pa = Pb = 100
a.grid = seq(0,2,length=Pa)
b.grid = seq(0,1,length=Pb)
crit.values = matrix(NA, nrow=Pa, ncol=Pb)
for(i in 1:Pa){
for(j in 1:Pb){
crit.values[i,j] = nl.crit(x,y,a.grid[i],b.grid[j])
}
}
ij.best = which.min(crit.values)
ij.best = arrayInd(ij.best, .dim=dim(crit.values))
# location of the minimum of the surface
# grid search solutions:
a.best = a.grid[ij.best[1]]
b.best = b.grid[ij.best[2]]
fitted = a.best * exp(b.best*x)
nlso = nls(y~a*exp(b*x), start=list(a=a.best, b=b.best))
a.best
b.best
plot(x, y, pch=20, xlab="Temperature (degrees F)",
ylab="Ozone (ppb)",
xlim=c(55,105))
points(x, fitted(nlso), pch=20, col='orange')
# optim() can also be used directly to fit a smooth model---------
dat = na.omit(airquality)
x = dat$Temp
y = dat$Ozone
#### Pick a nonlinear model and fit it using optim()
# First, write a function that evaluates the criterion:
criterion <- function(param,x,y){
# The function has to be defined this way
# (with param as its 1st argument)
# if we are going to apply optim() to it... cf. ?optim
a = param[1]
b = param[2]
return( mean( (y - a*exp(b*x))^2 ) )
}
# Then, apply optim() to it:
oo = optim(par=list(a=1, b=0.01), fn=criterion, x=x, y=y)
# now reconstruct fitted values from optim() output:
a.hat = oo$par[1]
b.hat = oo$par[2]
y.hat = a.hat * exp(b.hat*x)
plot(x,y)
is <- order(x)
lines(x[is], y.hat[is], pch=21, col=2)
# w9----------------
# B-spline--------
# using dataset ahmd$mx from library MortalityLaws, plot the crude force of mortality curve.
rm(list = ls())
library('MortalityLaws')
age <- c(0:110)
mM <- ahmd$mx[,4]
plot(age,mM,t='b',pch=20)# Crude force of mortality (2010)
plot(age,log(mM))
library('splines')
BM3 <- bs(age)
BM6 <- bs(age,df=6)
matplot(age,BM3,t='b')#Plot Columns of Matrices
matplot(age,BM6,t='b')
BMtst <- bs(age,knots = F)#
# Compute the design matrix for 3 dof’s and quote the projection for age 40 onto
# it. Then reconstruct the B-spline approximation to the crude male force data
# using linear regression. Plot the output over the data points and criticize.
age <- c(0:110)
mM <- ahmd$mx[,4]
BM <- bs(age)# compute bases
lm1 <- lm(mM~BM)
y <- numeric(length(age))# a vector to hold the reconstruction
for (i in 1:length(age)) {
y[i] <- as.numeric(sum(lm1$coef[2:4]*BM[i,])+lm1$coef[1])
}
lm1$coefficients
plot(age,mM,pch=20)
lines(age,y,t='l',col=4)# these two curves coincide
lines(age,fitted(lm1),t='l',col=2)
y-fitted(lm1)
# a better B-spline:
BM <- bs(age,df=6)# compute bases
lm1 <- lm(mM~BM)
y <- numeric(length(age))
for (i in 1:length(age)) {
y[i] <- as.numeric(sum(lm1$coef[2:7]*BM[i,])+lm1$coef[1])
}
plot(age,mM,pch=20)
lines(age,y,t='l',col=4)
lines(age,fitted(lm1),t='l',col=2)
# Continuing from the previous exercise, compute the B-spline as a power series,
# i.e. compute the traditional cubic spline representation
#   S(x) = b0 + b1*x + b2*x^2 + b3*x^3 + sum_{j=1..J} a_j * (x - x_j)^3_+
# where (x - x_j)^3_+ = max((x - x_j)^3, 0), using J = 3 interior knots at 40, 60 and 80.
# Compare with the fitted values from the regression onto the design matrix
# obtained from splines::bs() for those knots.
x <- c(0:110)
mM <- ahmd$mx[,4]
BM <- bs(x,knots = c(40,60,80))# also called the design matrix
lm2 <- lm(mM~BM)
# quick scalar checks of the truncated power function (without overwriting the age vector x):
pmax((41-40)^3,0)
pmax((39-40)^3,0)
# build the truncated power basis on the full age vector:
k40 <- pmax((x-40)^3,0)
k60 <- pmax((x-60)^3,0)
k80 <- pmax((x-80)^3,0)
lm3 <- lm(mM~x+I(x^2)+I(x^3)+k40+k60+k80+0)
plot(x,mM,pch=20)
lines(x,fitted(lm3),col='red')
lines(x,fitted(lm2),col='blue')
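# Added comparison (not in the original notes): a numeric look at how close the two fits are.
# They need not agree exactly, since lm3 was forced through the origin with '+0'.
summary(fitted(lm2) - fitted(lm3))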
par(mfrow=c(1,2))
matplot(BM)# original basis for B-spline
POW <- cbind(x,x^2,x^3,k40,k60,k80)
matplot(POW)
# w10----------------
# loess() handles several predictors; lowess() takes a single x
rm(list = ls())
dat = na.omit(airquality)
t = dat$Temp
y = dat$Ozone
plot(t,y,xlim = c(50,110) )
range(t)
newx <- c(95:100)
abline(v=newx,col='blue')
lo <- loess(y~t)
summary(lo)
# check output values are in the same order as input values:
summary(c(lo$x - t)) # ok, they are in the same order
summary(c(lo$y - y)) # also in the same order as the input
pred.lo <- predict(lo,newdata = data.frame(t=newx))# the name t must match the predictor used in the loess fit
points(newx,pred.lo,col='red')# loess cannot predict beyond the range of the observed x values
low <- lowess(t,y)
summary(low)
summary(c(low$x - t)) # NO, THEY ARE NOT
summary(c(low$x - sort(t))) # coz lowess() sorted the input values!
# sqrt( mean( (y-fitted(nlso))^2 ) ) # nlso was the NLS fit from the w8 section; refit it first if comparing
sqrt( mean( (y-fitted(lo))^2 ) )
sqrt( mean( (y[order(t)]-low$y)^2 ) ) # lowess() returns values sorted by t, so reorder y accordingly
pred.low <- approx(low$x,low$y,xout = newx,rule = 2)
# lowess() has no predict method, so we use linear interpolation (approx) instead
# rule=2 extrapolates using the value at the nearest data extreme; rule=1 returns NA outside the range
pred.low
pred.low$y
points(newx,pred.low$y,col='red',pch=20,cex=.8)
# this is still not great either: beyond the data range the curve is a flat line, because approx()
# does linear interpolation and rule=2 carries the end value forward, so once there are no points
# to the right we simply get a horizontal line
tp <- approx(lo$x,y=lo$fitted,xout = newx,rule = 2)
# linear interpolation applied to the loess fit
tp$y
points(newx,tp$y,pch=20,col='orange')
# still not ideal, since everything here is just linear interpolation
# using a spline can fix this
# Using B-splines------------
library('splines')
KN <- quantile(t,c(0.15,0.40, 0.60, 0.70, 0.80, 0.90))
BM <- bs(t,knots = KN)
bl <- lm(y~BM)# don't write the fit this way any more (see below)
ny = predict(BM, newx)
pred.bspline = predict(bl, newdata=as.data.frame(ny)) # nope -- the names don't match
# the correct way to predict from a B-spline fit
B.spline = lm(y~.,data=data.frame(BM))# use lm(y~., data=...) rather than lm(y~BM); don't write it any other way
# only then does predict() work properly
pred.bspline = predict(B.spline, newdata=data.frame(ny))
pred.bspline
points(newx,pred.bspline,col='red')
# P-spline--------
rm(list = ls())
dat = na.omit(airquality)
t = dat$Temp
y = dat$Ozone
plot(t,y,xlim = c(50,110) )
newx <- c(95:100)
abline(v=newx,col='blue')
ps <- smooth.spline(t,y,cv=T)
# how to predict from the P-spline:
pred.pspline = predict(ps, x=newx)# here x= is an argument name of predict.smooth.spline
KN <- quantile(t,c(0.15,0.40, 0.60, 0.70, 0.80, 0.90))
BM <- bs(t,knots = KN)
b.s <- lm(y~.,data = data.frame(BM))
ny = predict(BM, newx)
pred.bspline <- predict(b.s,newdata =data.frame(ny) )
points(t, fitted(b.s), pch=20, col='brown', cex=1.5)
points(t, fitted(ps), pch=15, col='navy')
legend("topleft", legend=c("data","NLS fit","B-spline","P-spline"),
col=c(8,4,2,'navy'), pch=c(20,20,20,15), bty='n')
points(newx,pred.bspline, col='brown', pch=14)# these points follow the general pattern
points(pred.pspline, col=3, pch=14)
plot(t,y,xlim = c(50,110) )
points(t, fitted(b.s), pch=20, col='brown', cex=1.5)
points(t, fitted(ps), pch=15, col='navy')
# here the P-spline does better than the B-spline
# controlling the B-spline
x <- t
KN = quantile(x, c(0.1, 0.4, 0.7))
BM = bs(x, knots=KN)
KN = quantile(x, c(0.05, 0.10, 0.20, 0.23, 0.45, 0.65, 0.95))
BM = bs(x, knots=KN)
KN = quantile(x, c(0.15,0.45, 0.65, 0.95))
BM = bs(x, knots=KN)
KN = quantile(x, c(0.15,0.45, 0.65, 0.95, 0.99)) # this one is quite dramatic
BM = bs(x, knots=KN)
KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.91)) # this one fits reasonably well
BM = bs(x, knots=KN)
KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.92))
BM = bs(x, knots=KN)
KN = quantile(x, c(0.15,0.45, 0.65, 0.70, 0.85, 0.93))
BM = bs(x, knots=KN)
# with different knots the estimates at the ends are very different; B-splines are hard to control at the ends
B.spline.df = lm(y~., data=as.data.frame(BM))
newBM = predict(BM, newx)
pred.bspline = predict(B.spline.df, newdata=as.data.frame(newBM))
plot(x, y, pch=20, col=8,
xlab="Temperature (degrees F)", ylab="Ozone (ppb)",
xlim=c(55,105))
points(x, fitted(B.spline.df), pch=20, col='red')
lines(newx, pred.bspline, col='navy', pch=8, t='b')
# now control the fit with a P-spline instead
# NOTE: control degrees of freedom in P-splines
P.spline3 = smooth.spline(x, y, df=3)# df controls the effective degrees of freedom (flexibility) of the smooth
P.spline5 = smooth.spline(x, y, df=5)
P.spline10 = smooth.spline(x, y, df=10)# smaller df gives a flatter (smoother) curve
pred3 = predict(P.spline3, x=newx)
pred5 = predict(P.spline5, x=newx)
pred10 = predict(P.spline10, x=newx)
#
plot(x, y, pch=20, col=8,
xlab="Temperature (degrees F)", ylab="Ozone (ppb)",
xlim=c(55,105))
points(x, fitted(P.spline3), pch=15, col='navy',cex=.5)
points(x, fitted(P.spline5), pch=15, col='blue',cex=.5)
points(x, fitted(P.spline10), pch=15, col='cyan',cex=.5)
points(pred3, col='navy', pch=20)
points(pred5, col='blue', pch=20)
points(pred10, col='cyan', pch=20)
# one of these curves looks a lot like the B-spline, so controlling the smoothing parameter may be a good approach:
# NOTE: control smoothness (ie penalty) parameter in P-splines
P.spline10a = smooth.spline(x, y, df=10)
P.spline10b = smooth.spline(x, y, df=10, spar=.9)# spar=smoothing parameter
P.spline10c = smooth.spline(x, y, df=10, spar=.1)
#
plot(x, y, pch=20, col=8,
xlab="Temperature (degrees F)", ylab="Ozone (ppb)",
xlim=c(55,105))
points(x, fitted(P.spline10a), pch=15, col='navy',cex=.5)# this one uses the default spar
points(x, fitted(P.spline10b), pch=15, col='blue',cex=.5)# spar=0.9: very smooth/flat
points(x, fitted(P.spline10c), pch=15, col='red',cex=.5)# spar=0.1: very sensitive (wiggly)
# plot the latter as a curve (requires re-ordering the points):
is = order(x)# just how to draw the curve: reorder the points first
xs = x[is]
lines(xs, fitted(P.spline10c)[is], pch=15, col='red')
# Q4.1----------
rm(list=ls())
setwd('/Users/apple/Desktop/22-r/ST4060/ST4060txt-csv')
library(splines)
LT <- read.table('irl_lifetable_2005.txt',header = T )
head(LT)
SLT = LT[c(1:106),]
mx = SLT[,8]
x = SLT$Age
onls = nls(mx~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
ABc = summary(onls)$coef[,1]# these act as the 'true' values, which we could never compute otherwise
set.seed(1)
x = seq(0, 110, by=1)
mx = ABc[1]+ABc[2]*ABc[3]^x # this is the theoretical model
mxn = mx
s1 = which(x<86)
s2 = which(x>85)
mxn[s1] = pmax(0.005,mx[s1]+rnorm(length(s1),0,.03))
mxn[s2] = mx[s2]+rnorm(length(s2),0,.06)# mxn has noise in it
dat = cbind(x,mx,mxn)
# (a)
plot(x,mx)# the true (noise-free) curve of the data, not computable in practice
points(x,mxn,cex=0.8,pch=20)# the data with added noise, i.e. our observations -- this is what we actually use
nls.ns = nls(mxn~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
lines(x,fitted(nls.ns),col='red')
ABc.ns = summary(nls.ns)$coef[,1]# extract the coefficients of the nls fit
# (b)
ps <- smooth.spline(x,mxn)
?smooth.spline
summary(ps)
lines(x,fitted(ps),col='blue')
sy = ps$y # the question says "Repeat the previous step, but on the smoothed curve obtained
# from a generic P-spline" -- which is why this step is here!!!!
nls.ps = nls(sy~A+B*c^x,start=list(A=.0003, B=.00002, c=1.108))
ABc.ps = summary(nls.ps)$coef[,1]
# MSE:
(ABc-ABc.ns)^2
(ABc-ABc.ps)^2
(ABc-ABc.ns)^2-(ABc-ABc.ps)^2 # still slightly smaller after the P-spline smoothing
# percentage errors:
((ABc-ABc.ns)/ABc)
((ABc-ABc.ps)/ABc)
((ABc-ABc.ns)/ABc)-((ABc-ABc.ps)/ABc) # still slightly smaller after the P-spline smoothing
# The above is a way of running a simulation from real-life data.
# The lesson: when fitting a complicated model, it can pay to smooth the data first
# (here with a P-spline) to reduce the level of noise, which a nonlinear fitting
# procedure would otherwise be very sensitive to.
# Q4.2 ----------------------
rm(list = ls())
dat <- read.csv('insdata.csv')
age = dat$Age
mF = dat$mF
# (a)
sp <- smooth.spline(age,mF,cv=T)
plot(age,mF,pch=20,cex=0.8)
lines(age,fitted(sp),col='red')
# (b)
par = sp$spar
sp2 <- smooth.spline(age,mF,cv=F,spar=par/2)
lines(age,fitted(sp2),col='blue')
# spar = smoothing parameter
# df controls the effective degrees of freedom of the smooth
# (c)
sp$x-sp2$x # check that sp$x and sp2$x are identical
# (d)
sp$x-age# also check against the original ages
mean((mF-sp$y)^2)
mean((mF-sp2$y)^2)
sqrt(mean((mF-sp$y)^2))
sqrt(mean((mF-sp2$y)^2))
# Judging by the numbers the second fit looks better (smaller RMSE), but it corresponds to the
# wiggly blue curve in the plot above, which is not what we want.
# Be clear about the goal: here we want a smooth curve, not the best in-sample fit,
# so the fit with the larger RMSE is actually the better choice.
sd(mF-sp$y) # spread (sd) of the residuals -- the variance component
sd(mF-sp2$y)# spread (sd) of the residuals
mean(mF-sp$y) #bias
mean(mF-sp2$y)#bias
# the bias is very small, so most of the MSE comes from the variance; hence the smoother
# spline has the larger MSE
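# Added check (approximate, since var() divides by n-1): MSE ~ variance + bias^2.
var(mF-sp$y) + mean(mF-sp$y)^2
mean((mF-sp$y)^2)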
# (e)
KN <- quantile(age,c(0.25,0.5,0.75))
BM <- bs(age,knots = KN)
matplot(age,BM,t='b')# don't forget age on the x-axis!!!!!!
# (f)
round(BM[which(age==60),],4)# these are the B-spline basis coordinates for age 60
abline(v=60, lty=3)
abline(h=BM[which(age==60),], lty=2)# how to indicate them on the plot
# (g)
bs <- lm(mF~.,data=data.frame(BM))
bs$coefficients
# (h)
mean((bs$residuals)^2)
mean((mF-sp$y)^2)
mean((mF-sp2$y)^2)
plot(age,mF)
lines(sp,col='red')
lines(sp2,col='blue')
lines(age,fitted(bs),col='orange')
# the B-spline is very sensitive at the tail end, because of the way the knots
# are arranged
# This B-spline seems comparable to the 1st P-spline, and
# not as effective as the second one.
# (i)
# first, see that we're missing some ages:
plot(age)
# interpolation grid:
all.ages = seq(min(age),max(age),by=1)
plot(all.ages)
lip <- approx(sp$x,sp$y,xout = all.ages)# fine to spell out all the arguments
sd(lip$y)
lo <- lowess(age,mF)
lilo <- approx(lo$x,lo$y,xout = all.ages)
# 'lpred' was never defined in these notes; presumably it was meant to be the P-spline's own
# predictions on the interpolation grid -- recreated here under that assumption:
lpred <- predict(sp, x=all.ages)$y
lpred-lilo$y
sd(lilo$y)
# open question ----------
plot(age,mF)
lines(all.ages,lip$y,col='red')
lines(all.ages,lpred,col='blue')
# Q4.3 ----------------------
rm(list = ls())
library('car')
library(splines)
dat <- Prestige
x = Prestige$income
y = Prestige$prestige
# B-spline:
BM <- bs(x,df=6)
bs <- lm(y~.,data = data.frame(BM))
bmse <- mean((bs$residuals)^2)
# P-spline:
ps <- smooth.spline(x,y)
ps$x-x
# For the P-spline, we need to interpolate!
range(x)
range(ps$x)
# the ranges are the same here, so rule=1 and rule=2 give the same result; if xout went outside the range of x, use rule=2
yp <- approx(ps$x,ps$y,xout=x)$y
yp2 <- approx(ps$x,ps$y,xout=x,rule = 2)$y
pmse <- mean((y-yp)^2)
# w11----------------
rm(list = ls())
x = iris[,c(2:4)]
y = iris[,5]
K <- 2
ko <- kmeans(x,K)
?kmeans
is <- c(1,3)
plot(x[,is],col=c(1,2,4)[ko$cluster],pch=20,cex=2)
z <- apply(x,2,scale)
(x[1,1]-mean(x$Sepal.Width))/sd(x$Sepal.Width)-z[1,1]
koz <- kmeans(z,K)
plot(x[,is],col=c(1,2,4)[koz$cluster],pch=20,cex=2)
pairs(iris[,1:4],pch=20)
cor(iris[,1:4])
dat <- EuStockMarkets
plot(EuStockMarkets, lwd=1.5)
pca <- prcomp(EuStockMarkets)
?prcomp #Principal Components Analysis
par(mfcol=c(4,1), mar=c(1,3,0,1))
# decompose the four-dimensional set of time series into principal-component time series
for(i in 1:4){
plot(pca$x[,i], t='l', lwd=2)
}
summary(pca)
??EuStockMarkets
names(pca)
pca$x
plot(pca)
plot(prcomp(iris[,1:4]))
plot(prcomp(iris[,1:4], scale.=TRUE))
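# Added note (not in the original notes): proportion of variance explained by each component,
# with and without scaling the variables.
summary(prcomp(iris[,1:4]))$importance[2,]
summary(prcomp(iris[,1:4], scale.=TRUE))$importance[2,]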
# Logistic regression based classification--------------
rm(list = ls())
library(ISLR)
dat <- Smarket
head(Smarket)
?Smarket
glm1 <- glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=dat, family=binomial)
summary(glm1)
n <- nrow(dat)
itrain <- which(dat$Year<2005)
glm1.cv <- glm(Direction~Lag1+Lag2,
data=dat,subset = itrain, family=binomial)
summary(glm1.cv)
po <- predict(glm1.cv,type = 'response')
summary(po)
glm.pred= rep("Down", length(po))
glm.pred[po>.5] = "Up"
tb <- table(glm.pred,dat$Direction[itrain])
tb
sum(tb)
length(itrain)
1-sum(diag(tb))/sum(tb)# overall error rate
Direction.2005 <- dat$Direction[-itrain] # itrain holds indices, so use -itrain (not !itrain)
po.2005 <- predict(glm1.cv,type ='response',newdata = dat[-itrain,])
length(po.2005)
length(po.2005)+length(po)
nrow(dat)
glm.pred.2005= rep("Down", length(po.2005))
glm.pred.2005[po.2005>.5] = "Up"
tb.test = table(glm.pred.2005,dat$Direction[-itrain])
1-sum(diag(tb.test))/sum(tb.test)
# Logistic regression based classification
# another example using iris--------------------------------------
# here we reduce it to a two-class problem
rm(list = ls())
iris <- iris
n <- nrow(iris)
n
x <- iris[sample(1:n,n),]
is.var <- (x$Species=='virginica') # a logical test; returns TRUE/FALSE
# is.var <- which(x$Species=='virginica') # returns positions, but we want TRUE/FALSE here
x$Species <- is.var
glmo <- glm(Species~., x, family=binomial)
glmo$fitted.values - predict(glmo, type='response')
# the point of the next few lines: make sure the response is not also sitting among the predictors passed via data=:
a <- x$Sepal.Length
b <- x[,-1]
lm1 <- lm(a~.,data = b)
lm2 <- lm(a~.,data = x)
lm3 <- lm(Sepal.Length~.,data = x)
n = nrow(x)
K = 10
train.acc = test.acc = numeric(K)
folds = cut(1:n, K, labels=FALSE)
k = 1
# rm(k)
thr = .8
for(k in 1:K){
itrain = which(folds!=k)
glmo = glm(Species~., data=x,
family=binomial,
subset=itrain)
tb1 = table(glmo$fitted.values>thr, x$Species[itrain])
train.acc[k] = sum(diag(tb1))/sum(tb1)
# prediction
p.test = predict(glmo, x[-itrain,], type='response')
tb2 = table(p.test>thr, x$Species[-itrain])
test.acc[k] = sum(diag(tb2))/sum(tb2)
}
mean(train.acc)
mean(test.acc)
tb1
tb2
# Illustration of hierarchical clustering ------------------------------------------------
data(eurodist)
hc = hclust(eurodist, method="ward.D") # "ward" was renamed to "ward.D" in newer versions of R
plot(hc)
hc = hclust(eurodist, method="single")
plot(hc)
hc = hclust(eurodist, method="complete")
plot(hc)
# another Clustering examples
# partition-based clustering------------------------------------------------
rm(list = ls())
x = iris[,1:4]
plot(x[,1:2], pch=20)
plot(x[,1:2], pch=20, col=c(1,2,4)[iris$Species])
plot(x[,c(3,4)], pch=20,col=c(1,2,4)[iris$Species])
# hierarchical clustering
# this part is unrelated to the code below
xs = apply(x, 2, scale)
ho = hclust(dist(x), method="ward.D")
hc = cutree(ho, k=3)
plot(x[,1:2], pch=20, col=hc)
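# Added check (not in the original notes): how the 3 hierarchical clusters line up with the species.
table(hc, iris$Species)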