第7周作业:相关性借补与 KNN 插补

#install.packages('DMwR')
library(DMwR)
# Load the algae data set from the raw text file 'Analysis.txt'.
# The file is read without a header row, so the 18 column names are supplied
# explicitly. The literal token 'XXXXXXX' is read in as NA, and string columns
# are kept as character rather than converted to factors.
# TRUE/FALSE are spelled out because the shorthands T/F are ordinary variables
# that can be reassigned.
algae <- read.table('Analysis.txt', header = FALSE, dec = '.',
                    col.names = c('season', 'size', 'speed', 'mxPH', 'mnO2',
                                  'Cl', 'NO3', 'NH4', 'oPO4', 'PO4', 'Chla',
                                  'a1', 'a2', 'a3', 'a4', 'a5', 'a6', 'a7'),
                    na.strings = c('XXXXXXX'), stringsAsFactors = FALSE)
# Show the first rows as a quick sanity check of the import.
head(algae)

1、利用变量间的相关性借补时,由于会发生最相关的变量对应位置处的观测值也恰好缺失的情形,所以请将这样的情形也考虑在内,即考虑用能借补的那些变量中相关性最大的那个即可。

library(reshape)
# Pairwise correlations between columns 4 to 18 of algae, computed only on
# rows that are complete across those columns.
# NOTE: the result is stored under the name `cor`, as in the original script;
# calls to cor() still resolve to the function.
cor <- cor(algae[, 4:18], use = "complete.obs")

# Reshape the correlation matrix into long form: one row per variable pair.
meltCor <- melt(cor)
names(meltCor) <- c('x1', 'x2', 'value')

# Drop the self-pairs (a variable paired with itself). Comparing the variable
# names is used instead of testing `value == 1`: that test relies on exact
# floating-point equality, would also discard two distinct variables that are
# perfectly correlated, and `-which(...)` removes every row when nothing
# matches.
meltCor <- meltCor[as.character(meltCor$x1) != as.character(meltCor$x2), ]

# Rows of algae that contain at least one missing value.
NAalgae <- algae[!complete.cases(algae), ]
head(NAalgae)

# Rows of algae without any missing value; used to fit the regressions.
comAlgae <- algae[complete.cases(algae), ]
# Fit a simple linear regression x1 ~ x2 for every variable pair listed in
# lmData and collect the fitted coefficients.
#
# Args:
#   lmData: data frame with one row per pair. Column 1 holds the response
#           variable name, column 2 the predictor variable name and column 3
#           the correlation of the pair.
#   data:   data frame the regressions are fitted on. Defaults to the global
#           comAlgae, which is what the function always used before this
#           parameter existed.
#
# Returns:
#   A data frame with one row per row of lmData and the columns
#   x1, x2 (character) and cor, a, b (numeric), where a is the slope and b the
#   intercept, i.e. x1 is predicted as x2 * a + b. The numeric columns were
#   previously returned as character strings; callers that wrap them in
#   as.numeric() keep working.
lmCoef <- function(lmData, data = comAlgae) {
  n <- nrow(lmData)
  coefData <- data.frame(x1 = character(n), x2 = character(n),
                         cor = numeric(n), a = numeric(n), b = numeric(n),
                         stringsAsFactors = FALSE)
  # seq_len() keeps the loop from running when lmData has no rows.
  for (i in seq_len(n)) {
    response <- as.character(lmData[i, 1])
    predictor <- as.character(lmData[i, 2])
    # Fit the model once and read both coefficients from the same fit.
    fit <- lm(as.formula(paste0(response, '~', predictor)), data = data)
    est <- coef(fit)
    coefData$x1[i] <- response
    coefData$x2[i] <- predictor
    coefData$cor[i] <- as.numeric(lmData[i, 3])
    coefData$a[i] <- est[[2]]  # slope
    coefData$b[i] <- est[[1]]  # intercept
  }
  coefData
}
# Build one coefficient table per response variable. Each table lists the
# other variables ordered from the strongest to the weakest absolute
# correlation, together with the regression coefficients from lmCoef().
responseVars <- unique(meltCor[, 'x1'])
corList <- lapply(responseVars, function(element) {
  # Pairs whose response is the current variable.
  pairs <- subset(meltCor, x1 == element)
  # Strongest absolute correlation first.
  ranked <- pairs[order(abs(pairs$value), decreasing = TRUE), ]
  lmCoef(ranked)
})
# Index the tables by the name of the response variable.
names(corList) <- responseVars

# Inspect the table of the first variable.
corList[[1]]
# Fill every missing value in NAalgae with a linear-regression prediction.
# For each missing cell, the predictor is the most strongly correlated
# variable (per corList) that is actually observed in the same row.
# seq_len() keeps the loop from running when NAalgae has no rows.
for (i in seq_len(nrow(NAalgae))) {
  # Column names that are missing / observed in this row. They are computed
  # once, before any value is filled in, so imputed values are never reused
  # as predictors within the same row.
  isMissing <- as.vector(is.na(NAalgae[i, ]))
  NAnames <- names(NAalgae)[isMissing]
  completeNames <- names(NAalgae)[!isMissing]
  for (name in NAnames) {
    df <- corList[[name]]
    # Columns without a coefficient table cannot be imputed this way;
    # leave them as NA.
    if (is.null(df)) {
      warning("No correlation table for column '", name,
              "'; value left as NA.", call. = FALSE)
      next
    }
    # df is ordered by decreasing absolute correlation, so the first
    # predictor observed in this row is the best usable one. The previous
    # while loop never terminated when no predictor was observed.
    usable <- which(df$x2 %in% completeNames)
    if (length(usable) == 0L) {
      warning("No observed predictor for column '", name,
              "' in row ", i, " of NAalgae; value left as NA.", call. = FALSE)
      next
    }
    best <- usable[1L]
    CoefName <- as.character(df[best, 'x2'])
    a <- as.numeric(df[best, 'a'])  # slope
    b <- as.numeric(df[best, 'b'])  # intercept
    NAalgae[i, name] <- NAalgae[i, CoefName] * a + b
  }
}

# Show the first rows of NAalgae after imputation.
head(NAalgae)

2、在最近邻(KNN)方法中,只借助没有缺失值的案例(Cases)进行插补。(这题做得不好)

# Reload the algae data set (presumably the copy shipped with DMwR -- confirm);
# this replaces the algae object created earlier in the script.
data(algae)
# Rows that contain at least one missing value; printed for inspection.
NAalgae <- algae[!complete.cases(algae), ]
NAalgae
# Rows without any missing value.
comAlgae <- algae[complete.cases(algae), ]
# Impute the incomplete rows one at a time: each incomplete row is stacked on
# top of the complete cases only, so knnImputation() never sees another
# incomplete row. The imputed first row is then written back.
# seq_len() keeps the loop from running when there are no incomplete rows
# (1:nrow() would iterate over c(1, 0) in that case).
for (i in seq_len(nrow(NAalgae))) {
  oneNAalgae <- rbind(NAalgae[i, ], comAlgae)
  NAalgae[i, ] <- knnImputation(oneNAalgae, k = 10)[1, ]
}
head(NAalgae)
  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 2
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值