# 1.数据的导入
# 用于导入数据的R函数:
# - 使用R包自带数据
# - 读取csv文件:read.table及其同类函数(read.csv、read.delim等)
# - 不规则数据:readLines
# - 读取excel文件:xlsx包/read.xlsx
# - 读取spss文件:foreign包/read.spss
# - 读取sas文件:foreign包/read.ssd
# 2.不规范数据的预处理
# 见下方代码:
############# download data from website, unzip data ########
############# read data from multiple separate files
# Raw data made public by the US government (US Agency for International
# Development).
# mode="wb" forces a binary transfer: a zip archive fetched in text mode can be
# corrupted by line-ending translation on Windows.
download.file(url="http://jaredlander.com/data/US_Foreign_Aid.zip",
              destfile="ForeignAid.zip",mode="wb")
unzip("ForeignAid.zip")
library(stringr)
# list the working directory to see what was extracted
dir()
# keep only the files whose name starts with "US_Foreign_Aid"
theFiles=dir(pattern = "^US_Foreign_Aid")
theFiles
# help page for the regular-expression syntax used in `pattern`
?regex
# loop through those files
for (a in theFiles)
{
  # build a good name to assign to data: characters 12-18 of the file name
  # (presumably yielding names such as "Aid_00s", which head() uses below)
  nameToUse=str_sub(string=a,start=12,end=18)
  temp=read.table(a,header=TRUE,sep=",",stringsAsFactors=FALSE)
  # create a workspace variable called <nameToUse> holding this file's data
  assign(x=nameToUse,value=temp)
}
head(Aid_00s)
# readLines(): for data files whose rows are not uniformly formatted
# step (1): read the raw file into a character vector
txt = readLines(con = "data1.2.1.txt")
txt
# step (2): select the lines containing data
# I is TRUE for every line that begins with "%"; those lines are dropped
I = grepl(pattern = "^%", x = txt)
I
dat = txt[!I]
dat
# step (3): split each remaining line into separate fields at the commas
fieldList = strsplit(x = dat, split = ",")
fieldList
#step(4): Standardize rows
# Arrange the fields of one split record into the fixed order
# (name, birth, death).
#
# x      : character vector holding the fields of a single record.
# cutoff : number separating birth values (< cutoff) from death values
#          (> cutoff). Defaults to 1890, the value that used to be hard-coded.
#
# Returns a character vector of length 3. A slot is NA when no field
# qualifies for it; when several fields qualify, the first one is used.
assignFields=function(x,cutoff=1890){
  out=rep(NA_character_,3)
  # name: the first field containing an alphabetic character. The any() guard
  # avoids the "replacement has length zero" error on a record with no name.
  isName=grepl("[[:alpha:]]",x)
  if(any(isName)) out[1]=x[isName][1]
  # Convert once. Non-numeric fields become NA, so the coercion warning is
  # expected and silenced here only.
  year=suppressWarnings(as.numeric(x))
  # A field already recognised as a name is never a year, even if it happens
  # to parse as a number (e.g. "Inf", "1e3").
  year[isName]=NA
  # NOTE: both comparisons are strict, so a value exactly equal to cutoff is
  # placed in neither slot.
  i=which(year<cutoff)
  if(length(i)>0) out[2]=x[i[1]]
  i=which(year>cutoff)
  if(length(i)>0) out[3]=x[i[1]]
  out
}
# run assignFields on every element of fieldList; lapply() returns a list
standardFields = lapply(X = fieldList, FUN = assignFields)
standardFields
# step (5): transform the list into a data.frame
# unlist() flattens the list into a single vector of all its atomic
# components; filling by row gives one matrix row per list element, and the
# matrix is then coerced into a data.frame
M = matrix(data = unlist(standardFields),
           nrow = length(standardFields),
           byrow = TRUE,
           dimnames = list(NULL, c("name", "birth", "death")))
M
deltons = as.data.frame(M, stringsAsFactors = FALSE)
deltons
# step (6): normalize and coerce to the correct types
# birth and death were stored as text in the matrix; make them numeric
deltons[c("birth", "death")] = lapply(deltons[c("birth", "death")], as.numeric)
deltons
str(deltons)