数据预处理demo

最新推荐文章于 2021-11-12 22:27:02 发布

妄念驱动

最新推荐文章于 2021-11-12 22:27:02 发布

阅读量428

点赞数

分类专栏： R 文章标签： R

本文链接：https://blog.csdn.net/hx2017/article/details/77770745

版权

R 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

R笔记：

# Step 1: read the raw file.
# readLines() returns one string per line, which suits data files whose rows
# are not uniformly formatted.
txt <- readLines("Data_Hw2.txt")
txt
# Step 2: select the lines that contain data.
# I is TRUE for every line that starts with "//"; those lines are dropped.
I <- grepl("^//", txt)
I
dat <- txt[!I]
dat
# Step 3: split each remaining line into separate fields at ";".
fieldList <- strsplit(dat, split = ";")
fieldList
str(fieldList)
#step(4):Standardize rows
# Helper that standardizes a single element of the field list.
#
# assignFields: map the fields of one split record onto three fixed slots.
#   x: character vector holding the fields of a single record.
# Returns a character vector of length 3:
#   [1] the first field containing an alphabetic character (NA if none does),
#   [2] x[2] converted to numeric, rounded, and stored as text
#       (NA when x[2] is not numeric),
#   [3] x[3] when the record has exactly three fields, otherwise NA.
# A record with fewer than two fields yields NA in all three slots.
assignFields <- function(x) {
  n_fields <- length(x)
  # Start from NA so that slots which are never filled are reported as
  # missing instead of as empty strings.
  out <- rep(NA_character_, 3)
  if (n_fields > 1) {
    # which(...)[1] is NA when nothing matches, so x[...] gives NA instead of
    # a "replacement has length zero" error; with several matches the first
    # one is used.
    first_alpha <- which(grepl("[[:alpha:]]", x))[1]
    out[1] <- x[first_alpha]
    # The second field is taken as the numeric value and rounded.
    out[2] <- round(as.numeric(x[2]))
    if (n_fields == 3) {
      out[3] <- x[3]
    }
  }
  out
}
# lapply() applies assignFields to every element of the list.
standardFields <- lapply(fieldList, assignFields)
standardFields
#step(5): transform a list to data.frame
# unlist() flattens the list into one vector holding all atomic components;
# the vector is copied row by row into a matrix, which is then coerced into a
# data.frame. The column count is fixed at 3 (one per slot returned by
# assignFields) so that an empty list still gives a 0 x 3 matrix.
M <- matrix(as.character(unlist(standardFields)), ncol = 3, byrow = TRUE)
colnames(M) <- c("Gender", "Age", "weight")
M
# Keep at most the first four records. seq_len(min(...)) avoids a
# "subscript out of bounds" error when fewer than four records exist, and
# drop = FALSE keeps M a matrix even when a single row remains.
# NOTE(review): the limit of four rows looks specific to the demo file - confirm.
M <- M[seq_len(min(4L, nrow(M))), , drop = FALSE]
M
# stringsAsFactors = FALSE keeps the text columns as character, not factor.
deltons <- as.data.frame(M, stringsAsFactors = FALSE)
deltons
#step(6):Normalize and coerce to correct types
str(deltons)
# J is TRUE where Gender starts with "m" or "M".
J <- grepl("^m", deltons$Gender, ignore.case = TRUE)
J
# Recode Gender in one vectorized step: "man" where J is TRUE, "woman"
# otherwise. Building the vector with rep() also works for zero rows, where a
# 1:length(...) loop would run with invalid indices.
gender <- rep("woman", nrow(deltons))
gender[J] <- "man"
# A missing or empty Gender stays missing instead of being labelled "woman".
gender[is.na(deltons$Gender) | !nzchar(deltons$Gender)] <- NA_character_
deltons$Gender <- gender
# Replace the decimal comma by a decimal point so as.numeric() can parse it.
deltons$weight <- gsub(",", ".", deltons$weight)
deltons$Age <- as.integer(deltons$Age)
deltons$weight <- as.numeric(deltons$weight)
deltons
str(deltons)