面临的问题
当遇到大于2G的文件时,电脑自带的软件往往无法打开进行阅读,因此,我们需要将大文件分割成多个小文件进行存储。
解决方案
本文使用Java来对文件进行分割,分割过程总共可以分为两步:
- 确认大文件中的行数:
/**
 * Counts the number of lines in a large text file and prints the result
 * together with the elapsed time.
 *
 * <p>The file is streamed line by line, so it never has to fit in memory.
 */
public class BeforeBreakFile {

    /** Path of the large text file whose lines are counted. */
    private static final String INPUT_PATH = "D:\\dataset_TIST2015_Checkins.txt";

    /**
     * Prints {@code rownum=<number of lines>} followed by the elapsed time in
     * whole seconds. I/O failures are reported on standard error and the
     * elapsed time is still printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        long start = System.currentTimeMillis();

        // try-with-resources guarantees the reader is closed even on failure.
        try (BufferedReader br = new BufferedReader(new FileReader(INPUT_PATH))) {
            // Start at zero so the result is the exact number of lines read;
            // a long cannot realistically overflow on multi-gigabyte inputs.
            long rownum = 0;
            while (br.readLine() != null) {
                rownum++;
            }
            System.out.println("rownum=" + rownum);
        } catch (IOException e) {
            // Covers FileNotFoundException as well (it is an IOException).
            System.err.println("Failed to read " + INPUT_PATH + ": " + e);
        }

        long end = System.currentTimeMillis();
        long time = (end - start) / 1000;
        System.out.println("时间:" + time + "s");
    }
}
- 读取大文件,并输出多个小文件
/**
 * Splits a large text file into a fixed number of smaller files, copying it
 * line by line so the input never has to fit in memory.
 *
 * <p>Output files are named {@code D:\transfor<i>_TIST2015_Checkins.txt} for
 * {@code i} in {@code 0 .. CHUNK_COUNT - 1}.
 */
public class InputDemo {

    /** Path of the large text file to split. */
    private static final String INPUT_PATH = "D:\\dataset_TIST2015_Checkins.txt";

    /** Read buffer size for the input file: 20 MB. */
    private static final int BUFFER_SIZE = 20 * 1024 * 1024;

    /** Number of output files to produce. */
    private static final int CHUNK_COUNT = 5;

    /** Line count of the input file, as obtained from the line-counting step. */
    private static final long TOTAL_LINES = 6652727L;

    /**
     * Reads the input file once and distributes its lines over
     * {@code CHUNK_COUNT} output files of (nearly) equal size.
     *
     * @param args ignored
     * @throws IOException if the input cannot be read or an output cannot be written
     */
    public static void main(String[] args) throws IOException {
        // Round up so that CHUNK_COUNT chunks are enough to hold every line.
        long linesPerChunk = (TOTAL_LINES + CHUNK_COUNT - 1) / CHUNK_COUNT;

        File file = new File(INPUT_PATH);
        try (BufferedReader input = new BufferedReader(new FileReader(file), BUFFER_SIZE)) {
            for (int i = 0; i < CHUNK_COUNT; ++i) {
                // The last chunk takes everything that is left, so no line is
                // dropped even if TOTAL_LINES underestimates the real count.
                boolean lastChunk = (i == CHUNK_COUNT - 1);
                long limit = lastChunk ? Long.MAX_VALUE : linesPerChunk;

                try (FileWriter output = new FileWriter(
                        "D:\\transfor" + i + "_TIST2015_Checkins.txt")) {
                    copyLines(input, output, limit);
                }
            }
        }
    }

    /** Copies up to {@code maxLines} lines from {@code input} to {@code output}. */
    private static void copyLines(BufferedReader input, FileWriter output, long maxLines)
            throws IOException {
        // readLine() strips the terminator; re-append the platform's own
        // ("\r\n" on Windows, "\n" on Linux/macOS).
        String eol = System.lineSeparator();
        String line;
        for (long copied = 0; copied < maxLines && (line = input.readLine()) != null; ++copied) {
            output.write(line);
            output.write(eol);
        }
    }
}