# R notes:
# step(1): Read the raw file, one element per line
# readLines() copes with files whose rows are not uniformly formatted
txt <- readLines("Data_Hw2.txt")
txt
# step(2): Keep only the lines that carry data
# Lines beginning with "//" are flagged here and excluded below
I <- grepl("^//", txt)
I
dat <- txt[!I]
dat
# step(3): Break each remaining line into fields at the ";" separator
# (the outer parentheses make R print the assigned value)
(fieldList <- strsplit(dat, split = ";"))
str(fieldList)
# step(4): Standardize rows
# Helper that processes a single element of the field list.
#
# assignFields(x)
#   x: character vector holding the fields of one record (one element of the
#      list produced by strsplit()).
#   Returns a character vector of length 3:
#     [1] the first field that contains a letter, or NA if there is none;
#     [2] the second field converted to numeric, rounded, and stored as text
#         (NA if it cannot be converted);
#     [3] the third field when the record has exactly three fields,
#         otherwise NA.
#   Records with fewer than two fields yield three NAs.
assignFields <- function(x) {
  # Start from NA so every slot that is never filled is reported as missing
  out <- rep(NA_character_, 3)
  n_fields <- length(x)
  if (n_fields > 1) {
    # The field containing alphabetic characters becomes the first column.
    # Only the first match is used, and "no match" leaves NA, so the
    # assignment can neither fail on a zero-length value nor warn on
    # several matches.
    has_letter <- grepl("[[:alpha:]]", x)
    if (any(has_letter)) {
      out[1] <- x[which(has_letter)[1]]
    }
    # NOTE(review): slots 2 and 3 are read by position, so the letter field
    # is presumably expected first - confirm against the data file.
    out[2] <- as.character(round(as.numeric(x[2])))
    if (n_fields == 3) {
      out[3] <- x[3]
    }
  }
  out
}
# lapply() applies a function to every element of a list and returns a list;
# here each split record is passed through assignFields()
standardFields <- lapply(X = fieldList, FUN = assignFields)
standardFields
# step(5): Transform the list into a data.frame
# unlist() produces one vector containing all atomic components of the list;
# that vector is copied into a matrix row by row (one row per list element),
# and the matrix is then coerced into a data.frame.
M <- matrix(
  unlist(standardFields),
  nrow = length(standardFields),
  byrow = TRUE
)
colnames(M) <- c("Gender", "Age", "weight")
M
# Keep rows 1 to 4 only.
# NOTE(review): the row count is hard-coded, presumably for this particular
# data file - confirm before reusing with other input.
M <- M[seq_len(4), ]
M
# stringsAsFactors = FALSE stops R from turning the text columns
# (e.g. the first one) into factors
deltons <- as.data.frame(M, stringsAsFactors = FALSE)
deltons
# step(6): Normalize values and coerce columns to the correct types
str(deltons)
# TRUE where the gender entry starts with "m" or "M"
J <- grepl("^m", deltons$Gender, ignore.case = TRUE)
J
# Recode the whole column at once instead of looping over 1:length(...),
# which would iterate over c(1, 0) and fail on a zero-row table.
# Missing or blank entries stay NA rather than being labelled "woman".
is_blank <- is.na(deltons$Gender) | !nzchar(trimws(deltons$Gender))
gender <- rep(NA_character_, nrow(deltons))
gender[J] <- "man"
gender[!J & !is_blank] <- "woman"
deltons$Gender <- gender
# Weights use a decimal comma; switch to a decimal point so that
# as.numeric() can parse them
deltons$weight <- gsub(",", ".", deltons$weight)
deltons$Age <- as.integer(deltons$Age)
deltons$weight <- as.numeric(deltons$weight)
deltons
str(deltons)