http://blog.csdn.net/lwl550660646/article/details/46739651
工作中遇到要将大文本(500M以上)文件切割成小文本文件,再利用多线程来提高上传效率的问题。如果直接用readLine逐行读取再写出,则效率很差。改进的方式是先按固定大小定位切分点,再从切分点向后寻找下一个换行符,把切分点延伸到该换行符之后,以保证每行记录的完整性。
下面是代码部分:
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.io.RandomAccessFile;
/**
 * Splits a large text file into smaller files of roughly equal size without
 * breaking a line across two output files.
 *
 * <p>Rather than reading the input line by line, the splitter jumps ahead by a
 * fixed number of bytes, then scans forward to the next line feed (0x0A) and
 * cuts there. Each output file therefore ends on a line boundary, except
 * possibly the last one, which ends wherever the input ends.
 */
public class Split {

    /** Default target size of one output file in bytes (10 MiB), before extension to the next line feed. */
    static final int byteSize = 10 * 1024 * 1024;

    /** Size of the buffer used to copy a chunk from the input to an output file. */
    private static final int COPY_BUFFER_SIZE = 64 * 1024;

    /**
     * Splits {@code originFile} into chunks of about {@link #byteSize} bytes.
     *
     * <p>Equivalent to {@code run(originFile, targetDirectoryPath, byteSize)}.
     *
     * @param originFile          path of the file to split
     * @param targetDirectoryPath directory that receives the chunk files; created if missing
     */
    public void run(String originFile, String targetDirectoryPath) {
        run(originFile, targetDirectoryPath, byteSize);
    }

    /**
     * Splits {@code originFile} into chunks of about {@code chunkSize} bytes, each
     * extended to the end of the line that straddles the cut point.
     *
     * <p>Output files are named {@code <name>_<seq>.txt}, where {@code <name>} is the
     * source file name without its last extension and {@code <seq>} starts at 1.
     * The method returns without doing anything when the source does not exist or
     * is a directory, or when the target exists but is not a directory or cannot
     * be created. I/O errors during splitting are printed to standard error and
     * end the split early; they are not propagated to the caller.
     *
     * @param originFile          path of the file to split
     * @param targetDirectoryPath directory that receives the chunk files; created if missing
     * @param chunkSize           minimum size in bytes of every chunk except the last
     * @throws IllegalArgumentException if {@code chunkSize} is not positive
     */
    public void run(String originFile, String targetDirectoryPath, int chunkSize) {
        if (chunkSize <= 0) {
            throw new IllegalArgumentException("chunkSize must be positive: " + chunkSize);
        }
        File sourceFile = new File(originFile);
        File targetFile = new File(targetDirectoryPath);
        if (!sourceFile.exists() || sourceFile.isDirectory()) {
            return;
        }
        if (targetFile.exists()) {
            if (!targetFile.isDirectory()) {
                return;
            }
        } else if (!targetFile.mkdirs() && !targetFile.isDirectory()) {
            // Could not create the directory and nobody else created it meanwhile.
            return;
        }

        // Strip the last extension once instead of re-running the regex for every chunk.
        String baseName = sourceFile.getName().replaceAll("[.][^.]+$", "");
        String outputPrefix = targetFile.getAbsolutePath() + "/" + baseName + "_";

        try (RandomAccessFile rFile = new RandomAccessFile(originFile, "r")) {
            long fileLength = rFile.length();
            long startPos = 0;
            long fileSeq = 1;
            while (startPos < fileLength) {
                long remaining = fileLength - startPos;
                long curByteSize = remaining;
                if (remaining > chunkSize) {
                    // Jump to the nominal cut point and extend the chunk to the end of
                    // the line found there. Computed in long so a very long line cannot
                    // overflow the chunk length.
                    rFile.seek(startPos + chunkSize);
                    long extended = (long) chunkSize + eofOrNextCRLFInterval(rFile);
                    curByteSize = Math.min(remaining, extended);
                }
                rFile.seek(startPos);
                copyChunk(rFile, curByteSize, new File(outputPrefix + fileSeq + ".txt"));
                startPos += curByteSize;
                fileSeq++;
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Copies exactly {@code length} bytes from the current position of {@code rFile}
     * into {@code chunkFile}, looping over short reads so no data is dropped.
     *
     * @throws EOFException if the input ends before {@code length} bytes were copied
     * @throws IOException  if reading or writing fails
     */
    private void copyChunk(RandomAccessFile rFile, long length, File chunkFile) throws IOException {
        byte[] buffer = new byte[(int) Math.min(length, COPY_BUFFER_SIZE)];
        try (OutputStream os = new FileOutputStream(chunkFile)) {
            long left = length;
            while (left > 0) {
                int count = rFile.read(buffer, 0, (int) Math.min(left, buffer.length));
                if (count < 0) {
                    throw new EOFException("Unexpected end of input while writing " + chunkFile);
                }
                os.write(buffer, 0, count);
                left -= count;
            }
        }
    }

    /**
     * Counts the bytes from the current file position up to and including the next
     * line feed (0x0A), or up to end of file when no line feed follows.
     *
     * <p>Despite the name, only the line feed byte is looked for; a preceding
     * carriage return is counted like any other byte. The file position is left
     * just after the last byte read.
     *
     * @param rFile file positioned where the scan should start
     * @return number of bytes consumed; {@code 0} when already at end of file
     * @throws IOException if reading fails
     */
    public int eofOrNextCRLFInterval(RandomAccessFile rFile) throws IOException {
        int interval = 0;
        int readByte;
        // read() reports end of file as -1, so only bytes actually read are counted.
        while ((readByte = rFile.read()) != -1) {
            interval++;
            if (readByte == 0X0A) {
                break;
            }
        }
        return interval;
    }

    /**
     * Splits a file and prints the start time, end time and elapsed seconds.
     *
     * @param args optional: {@code args[0]} is the source file and {@code args[1]}
     *             the target directory; the built-in sample paths are used otherwise
     */
    public static void main(String[] args) {
        String sourceFile = "D:\\Split\\posinv\\posinv.txt";
        String targetFilePath = "D:\\Split\\posinv";
        if (args != null && args.length >= 2) {
            sourceFile = args[0];
            targetFilePath = args[1];
        }
        Split s = new Split();
        long start1 = System.currentTimeMillis();
        System.out.println(start1);
        s.run(sourceFile, targetFilePath);
        long start2 = System.currentTimeMillis();
        System.out.println(start2);
        System.out.println((start2 - start1) / 1000.00 + " second");
    }
}
- 1
- 2
- 3
- 4
- 5
- 6
- 7
- 8
- 9
- 10
- 11
- 12
- 13
- 14
- 15
- 16
- 17
- 18
- 19
- 20
- 21
- 22
- 23
- 24
- 25
- 26
- 27
- 28
- 29
- 30
- 31
- 32
- 33
- 34
- 35
- 36
- 37
- 38
- 39
- 40
- 41
- 42
- 43
- 44
- 45
- 46
- 47
- 48
- 49
- 50
- 51
- 52
- 53
- 54
- 55
- 56
- 57
- 58
- 59
- 60
- 61
- 62
- 63
- 64
- 65
- 66
- 67
- 68
- 69
- 70
- 71
- 72
- 73
- 74
- 75
- 76
- 77
- 78
- 79
- 80
- 81
- 82
- 83
- 84
测试文件posinv.txt大小为400多M,数据量是500w+条。
运行结果:
1435820829795
1435820831057
1.262 second