R 笔记:大型数据文件流读取与写入

# ------------ Clear existing variables ------------
# NOTE(review): rm(list=ls()) removes every non-hidden object in the
# environment the script is run in, so anything the caller had defined
# there is lost when this file is source()d.
rm(list=ls())

# Wall-clock time at script start; handed to StopWatch() after each pass.
START_TIME <- Sys.time()
# Print the wall-clock time elapsed since `start_time` as "Time Cost: HH:MM:SS".
#
# start_time: a date-time accepted by difftime() (e.g. a Sys.time() value).
# Returns (invisibly, via print) the string that was printed.
#
# The hour field is computed arithmetically instead of formatting the elapsed
# seconds as a clock time, so runs of 24 hours or more print e.g. "25:01:01"
# rather than wrapping back to "01:01:01".
StopWatch <- function(start_time){
	elapsed <- as.numeric(difftime(Sys.time(), start_time, units='secs'))
	# Whole seconds only; a start_time in the future is reported as zero.
	secs <- max(0, floor(elapsed))
	hh <- secs %/% 3600
	mm <- (secs %% 3600) %/% 60
	ss <- secs %% 60
	print(paste('Time Cost', sprintf("%02d:%02d:%02d", hh, mm, ss), sep=': '))
}

# Work inside a fixed project directory, creating it (and any missing
# parents) on first run. dir.exists() is used so that a regular file with
# the same name is not mistaken for the directory.
path <- "C:/Users/Public/Data Analysis in R"
if (!dir.exists(path)){
	print("Creating work directory...")
	dir.create(path, recursive=TRUE)
}
setwd(path)

# Input file, relative to the work directory: "data/data.csv".
data_folder <- "data"
filename <- "data.csv"
filename <- file.path(data_folder, filename)

# Abort the script with an error instead of calling quit(): quit() would end
# an interactive session outright and, under Rscript, exit with status 0 as
# if the run had succeeded.
if (!file.exists(filename)){
	stop('Data file does not exists.', call.=FALSE)
}

# Decide whether a row belongs to one of the windows that are copied verbatim.
#
# mtime:  timestamp of the row being examined (anything difftime() accepts).
# btime:  reference timestamp (the first or last timestamp of the file).
# border: window size in hours.
# Returns TRUE when mtime lies within `border` hours of btime, on either side.
#
# The comparison used to be `> border`, i.e. TRUE for rows *outside* the
# window. The script keeps the first resample_length_first and the last
# resample_length_second lines verbatim and subtracts both counts from the
# row budget, which is only coherent when the counts are the rows *inside*
# the head and tail windows.
resample <- function(mtime, btime, border){
	gap_hours <- abs(as.numeric(difftime(mtime, btime), units='hours'))
	gap_hours <= border
}

# ---- Pass 1: print the header, find the first/last timestamps, count lines ----
border <- 24					# threshold handed to resample(), in hours
resample_length_first <- 0		# filled in by the second pass
resample_length_second <- 0		# filled in by the second pass
# 1-based index of the line currently held in `line`. After the loop it is
# one more than the number of lines in the file, because it is bumped once
# more after the read that hits end-of-file; the third pass relies on that.
data_length <- 1
stime <- NULL					# first field of the first data row
mtime <- NULL					# most recent line that contained ';'

fcon <- file(filename, open='r')
# tryCatch(..., finally=) closes the connection even if a read or print fails.
tryCatch({
	line <- readLines(fcon, n=1)
	if (length(line) == 0){
		# Without this guard the header print below fails with
		# "subscript out of bounds" on an empty file.
		stop("Data file is empty: ", filename, call.=FALSE)
	}
	print('Titles>>>')
	print(strsplit(line, split=';')[[1]])
	while(length(line) != 0){
		if(grepl(";", line)){
			mtime <- line
			# Take the start time from the first well-formed row after the
			# header (normally line 2), so one malformed line 2 does not
			# leave stime undefined for the second pass.
			if (is.null(stime) && data_length >= 2){
				stime <- strsplit(mtime, split=';')[[1]][1]
				print(paste('startTime', stime, sep=': '))
			}
		}else{
			print("Unexpected line:")
			print(data_length)
			print(line)
		}
		line <- readLines(fcon, n=1)
		data_length <- data_length + 1
	}
}, finally = close(fcon))

if (is.null(mtime)){
	stop("No ';'-separated line found in ", filename, call.=FALSE)
}
# mtime now holds the last ';'-separated line; its first field is the end time.
etime <- strsplit(mtime, split=';')[[1]][1]
print(paste('endTime', etime, sep=': '))
print(paste('Count', data_length, sep=': '))

StopWatch(START_TIME)

# ---- Pass 2: count head/tail rows, then derive the sampling interval ----
# For every ';'-separated row after the header, the first field is compared
# with stime and etime via resample(); each TRUE result bumps the matching
# counter.
scon <- file(filename, open='r')
# tryCatch(..., finally=) closes the connection even if resample() fails,
# e.g. on a timestamp that cannot be parsed.
tryCatch({
	line <- readLines(scon, n=1)	# header line, not counted
	while(length(line) != 0){
		line <- readLines(scon, n=1)
		if(length(line) > 0 && grepl(";", line)){
			mtime <- strsplit(line, ';')[[1]][1]
			if(resample(mtime, stime, border)){
				resample_length_first <- resample_length_first + 1
			}
			if(resample(mtime, etime, border)){
				resample_length_second <- resample_length_second + 1
			}
		}
	}
}, finally = close(scon))

# Row budget for the output file.
# NOTE(review): presumably chosen to stay under Excel's row limit - confirm.
max_rows <- 1048570
resample_length <- resample_length_first + resample_length_second
remaining_rows <- data_length - resample_length	# rows left to be thinned
remaining_budget <- max_rows - resample_length	# output rows left for them
if (remaining_rows > 0 && remaining_budget > 0){
	interval <- ceiling(remaining_rows / remaining_budget)
}else{
	# A non-positive budget used to yield an interval of 0, Inf or a negative
	# number; 0 makes `idx_tmp %% interval` NaN and aborts the third pass.
	if (remaining_rows > 0){
		warning("Head and tail rows already use up the row budget; ",
			"only the first in-between row will be sampled.", call.=FALSE)
	}
	# Always at least 1; when rows remain, large enough that only the first
	# of them is written.
	interval <- max(1, remaining_rows)
}
print(paste("Interval", interval, sep=': '))

StopWatch(START_TIME)

# ---- Pass 3: copy the file to resample.csv, thinning the in-between rows ----
idx <- 1		# 1-based number of the line being handled (header is 1)
idx_tmp <- 0	# number of in-between lines seen so far
tcon <- file(filename, open='r')
d_con <- file("resample.csv", open='w')
repeat {
	line <- readLines(tcon, n=1)
	if (length(line) == 0){
		break	# end of input
	}
	if (idx > resample_length_first && idx < data_length - resample_length_second){
		# In-between line: keep one line out of every `interval`.
		if (idx_tmp %% interval == 0){
			writeLines(line, d_con)
		}
		idx_tmp <- idx_tmp + 1
	} else {
		# One of the first resample_length_first or last
		# resample_length_second lines: copied unchanged.
		writeLines(line, d_con)
	}
	idx <- idx + 1
}
close(d_con)
close(tcon)

StopWatch(START_TIME)

 

1. 对于体积较大的csv文件,不仅用Microsoft Excel打不开,而且在用R处理时,使用read.csv()函数也无法一次性全部读入,所以改用R中的readLines()和writeLines()函数逐行读写,以减少内存消耗。

2. 对数据文件起始位置定位暂时想不到好的办法,不得不遍历两次。
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值