两个上亿行的大文件取交集

前两天看到一哥们百度面试归来后发的帖子,并分享了其百度面试题,其中有一个题大意如下:
现有两个上亿行的文件,每一行都只有一个数字,求这两个文件的交集。

我的思路如下:首先是分别对这两个文件排序,然后,再同时遍历这两个文件。求出交集即可。
下面我对大文件的排序进行了简单的实现。
基本思路如下,首先对大文件进行拆分,一亿行的大文件可以拆分成10个小文件,并分别对这10个小文件进行排序,之后对这10个有序的小文件进行遍历,比较,再合并成一个大文件即可完成排序。
里面文件的路径为D://bigfile

代码如下。
main函数在bigFile中。

package com.fly.lee.bigfile;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class BigFile {
private List<String> subFileNameList = new ArrayList<String>();
private String bigFilePath = "D:/bigfile/";

private String bigFileName = "Source1.txt";
private FileGenerator fGenerator = new FileGenerator();

private static final int theMaxNumber = 10000000;
private List<Integer> theContentWritingToSubFile = new ArrayList<Integer>(theMaxNumber);

public void sortTheBigfile(String destination){
splitTheBigFile();
SourceFolder sFolder = new SourceFolder();
sFolder.setFilenameList(subFileNameList);
sFolder.setDestination(destination);
try {
sFolder.compareFiles();
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
removeTheSubFiles();
}

public void setBigFilePath(String bigFilePath) {
this.bigFilePath = bigFilePath;
}

public void setBigFileName(String bigFileName) {
this.bigFileName = bigFileName;
}

private void removeTheSubFiles() {
for(String fileName:subFileNameList){
File file = new File(fileName);
if(file.exists()&&file.canWrite()){
System.out.println(fileName+":"+file.delete());;
}
}
}

public void splitTheBigFile(){
System.out.println("begin to spliting the big files...");
int counter = 0;
File file = new File(bigFilePath+bigFileName);
BufferedReader reader = null;
FileReader fReader;
int fileFlag = 1;
try {
fReader = new FileReader(file);
reader = new BufferedReader(fReader);
String tempString = null;
while ((tempString = reader.readLine()) != null) {
if(tempString.trim().equals("")){
break;
}
int tempInt = Integer.valueOf(tempString);
theContentWritingToSubFile.add(tempInt);
if(isTheListFull(counter)){
handleTheFullList(fileFlag);
theContentWritingToSubFile.clear();
fileFlag ++;
}
}
handleTheFullList(fileFlag);
fReader.close();
reader.close();
System.out.println("finishing the spliting work.");
}catch(Exception e){
e.printStackTrace();
}
}

private void handleTheFullList(int fileFlag) throws Exception {
System.out.println("handle the full list...");
String tempFilePath = bigFilePath + fileFlag + ".txt";
writeTheContentToSubFile(tempFilePath);
subFileNameList.add(tempFilePath);
theContentWritingToSubFile.clear();
System.out.println("the full list is clear now...");
}

private void writeTheContentToSubFile(String tempFilePath) throws Exception {
System.out.println("begin to write the content to sub file...");
System.out.println("sub file path:"+tempFilePath);
Collections.sort(theContentWritingToSubFile);
fGenerator.setFileName(tempFilePath);
fGenerator.setTheListNeedToWriteTofile(theContentWritingToSubFile);
fGenerator.writeToFileFromTheList(true);
System.out.println("finishing this loop of writing the content to sub file...");
}

private boolean isTheListFull(int counter) {
return theContentWritingToSubFile.size() >= theMaxNumber;
}

public static void main(String[] args) {
BigFile bf = new BigFile();
bf.setBigFileName("Source1.txt");
bf.sortTheBigfile("D:/bigfile/Source1_sorted.txt");
}
}


SourceFolder.java: 主要负责对文件夹下拆分后的小文件进行比较并合并成一个新的有序文件。

package com.fly.lee.bigfile;

import java.io.BufferedReader;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class SourceFolder {
private List<String> filenameList;
private Map<Integer, Integer> numberCache;
private List<BufferedReader> bufferedReaderList;
private int endFlagNumber = 0;
private List<Integer> contentListWritingToFile;
private FileGenerator fileGenerator;
private String destination = "D:/bigfile/AllSource.txt";


public SourceFolder() {
contentListWritingToFile = new ArrayList<Integer>();
filenameList = new ArrayList<String>();
bufferedReaderList = new ArrayList<BufferedReader>();
fileGenerator = new FileGenerator();
}

public void setDestination(String destination) {
this.destination = destination;
}
public void addFile(String filename) {
this.filenameList.add(filename);
}
public void setFilenameList(List<String> filenameList) {
this.filenameList = filenameList;
}

public void compareFiles() throws Exception {
System.out.println("filenameList:"+filenameList);
initTheBufferedReaderList();
initTheNumberCache();
while(!isAllFilesFinishTheComparing()){
int theIndexOfReaderNeedingToMove = findTheLastIndexOfTheMinNumber();
addTheNumberToFile(theIndexOfReaderNeedingToMove);
updateNumberCache(theIndexOfReaderNeedingToMove);
}
addTheLastListToFile();
closeAllIOStreams();
}

private void closeAllIOStreams() throws IOException {

for(BufferedReader bReader:bufferedReaderList){
if(bReader != null){
bReader.close();
}
}

}

private int findTheLastIndexOfTheMinNumber() {
int theIndexOfTheMinNumber = 0;
int mixNumber = getTheFirstNumberFromTheCache();
for (int index = 0; index < numberCache.size(); index++) {
if(numberCache.get(index) == null){
continue;
}
int theNumberWillBeCompared = numberCache.get(index);
if (mixNumber >= theNumberWillBeCompared) {
mixNumber = theNumberWillBeCompared;
theIndexOfTheMinNumber = index;
}
}
return theIndexOfTheMinNumber;
}

private int getTheFirstNumberFromTheCache() {
for (int index = 0; index < numberCache.size(); index++) {
if(numberCache.get(index) == null){
continue;
}
return numberCache.get(index);
}
return 0;
}

private void addTheNumberToFile(int theIndexOfReaderNeedingToMove) throws Exception {
contentListWritingToFile.add(numberCache.get(theIndexOfReaderNeedingToMove));
if(contentListWritingToFile.size() == 1000000){
fileGenerator.setTheListNeedToWriteTofile(contentListWritingToFile);
fileGenerator.setFileName(destination);
fileGenerator.writeToFileFromTheList( false);
contentListWritingToFile.clear();
}
}

private void updateNumberCache(int index) throws Exception {
BufferedReader bufferedReader = bufferedReaderList.get(index);
String tempString = null;
if ((tempString = bufferedReader.readLine()) != null) {
if (!"".equals(tempString.trim())) {
int tempInt = Integer.valueOf(tempString);
numberCache.put(index, tempInt);
}
} else {
numberCache.put(index, null);
endFlagNumber ++;
}
}

private void addTheLastListToFile() throws Exception {
fileGenerator.setTheListNeedToWriteTofile(contentListWritingToFile);
fileGenerator.setFileName(destination);
fileGenerator.writeToFileFromTheList(false);
}



private void initTheBufferedReaderList() throws FileNotFoundException {
System.out.println("begin to initial the buffered reader...");
for (String filename : filenameList) {
BufferedReader bufferedReader = new BufferedReader(new FileReader(
filename));
bufferedReaderList.add(bufferedReader);
}
System.out.println("finish initialing the buffered reader...");
}

private void initTheNumberCache() throws Exception {
System.out.println("begin to initial the number cache...");
numberCache = new HashMap<Integer, Integer>(filenameList.size());
for (int index = 0; index < filenameList.size(); index++) {
updateNumberCache(index);
}
System.out.println("finish initialing the number cache...");
}

private boolean isAllFilesFinishTheComparing() {

return endFlagNumber == filenameList.size();
}
}

FileGenerator.java主要是负责将生成好的链表写入到一指定的文件中,可以选择是覆盖文件或者是在文件后添加。里面也有生成上亿行文件的方法(测试时可以事先用此方法生成大文件。 : ) ).

package com.fly.lee.bigfile;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class FileGenerator {
private int wholeLineNumber = 100000000;

private List<Integer> theListNeedToWriteTofile;
private String fileName;
private FileWriter fWriter = null;

// public void write
public static void main(String[] args) {
String fileName = "D:/bigfile/Source_yiwan.txt";
FileGenerator fileGenerator = new FileGenerator();
fileGenerator.setFileName(fileName);
try {
fileGenerator.createRandomFile();
fileGenerator.closeFileWriter();
} catch (Exception e) {
e.printStackTrace();
}
}

public void setFileName(String fileName) {
this.fileName = fileName;
}

public void setWholeLineNumber(int wholeLineNumber) {
this.wholeLineNumber = wholeLineNumber;
}

public void setTheListNeedToWriteTofile(List<Integer> theListNeedToWriteTofile) {
this.theListNeedToWriteTofile = theListNeedToWriteTofile;
}
public void createRandomFile() throws Exception {
int listSize = 10000000;
theListNeedToWriteTofile = new ArrayList<Integer>(listSize);
for(int i = 0; i < wholeLineNumber; i ++){
int tempRandomInt = (int)(Math.random()*100000000);
theListNeedToWriteTofile.add(tempRandomInt);
if(theListNeedToWriteTofile.size()==listSize){
System.out.println("begin to write to the file...");
writeToFileFromTheList(false);
theListNeedToWriteTofile.clear();
System.out.println("finish this loop...");
}
}
writeToFileFromTheList(false);
}

public void writeToFileFromTheList(boolean isNeedToCreateNewFile) throws Exception {
System.out.println("write the content to file...");
try {
if(isNeedToCreateNewFile){
createNewFile(fileName);
}
StringBuilder contentWritingToFile = new StringBuilder();
int cycleLineNumber = 1000000;
int counter = 0;
fWriter = new FileWriter(fileName,true);
for(int index = 0; index < theListNeedToWriteTofile.size(); index ++){
int tempRandomInt = theListNeedToWriteTofile.get(index);
contentWritingToFile.append(tempRandomInt+"\n");
if(counter == cycleLineNumber){
fWriter.append(contentWritingToFile.toString());
counter = 0;
contentWritingToFile = new StringBuilder();
}
}
// while(theListNeedToWriteTofile.size() != 0){
// int tempRandomInt = theListNeedToWriteTofile.remove(0);
// contentWritingToFile.append(tempRandomInt+"\n");
// if(counter == cycleLineNumber){
// fWriter.append(contentWritingToFile.toString());
// counter = 0;
// contentWritingToFile = new StringBuilder();
// }
// }
fWriter.append(contentWritingToFile.toString());
System.out.println("done..........");
} catch (Exception e) {
// TODO Auto-generated catch block
e.printStackTrace();
} finally{
closeFileWriter();
}
}

private void closeFileWriter() throws IOException{
if(fWriter != null){
fWriter.close();
}
}
private void createNewFile(String fileName) throws IOException {
File file = new File(fileName);
if(file.delete()){
file.createNewFile();
}
}
}

至此,完成了大文件的排序。至于下一步就是对着两个有序的大文件进行遍历比较了,
1.两个索引分别指向两个大文件的第一行。
2.如果两个索引所指的数据一样,记录当前数据到新的文件,并将两个指针分别下移一位。
如果二者不相等,则将数据较小的指针下移一位。
3.判断是否到其中一个文件的结尾,如果是,结束比较,否则重复步骤2。
这样就可以取出二者的交集,代码到此结束。
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值