如何找出两个大文件中的数据不同部分
问题描述:对比两个大于4GB的日志文件A和B,如何高效地找出A和B中数据不同的部分。
整体思路:
第一步:文件分割;将大文件分割成多个小文件。本文采用哈希函数来分割大文件:扫描文件A,对每行字符串求hash(url) % M,其中url是每行中用于比较的字符串(代码中取每行"|||"之前的部分),hash函数取JDK自带的hashCode方法,M表示分割后的小文件数目。由于hashCode可能为负数,余数为负时需再加上M,使结果落在0到M-1之间。根据所得的值,将url写入到对应的小文件中,如hash(url) % M = 4,则写入编号为4的小文件a(4)中。如此,大文件A可以分为<a(0),a(1),a(2),...a(M-1)>,同理,大文件B可以分为<b(0),b(1),...b(M-1)>。可能相同的url必然都在编号相同的小文件中,也就是说,我们只需要寻找每一对小文件a(i)和b(i)中的不同部分,然后归并所有的不同部分,就得到两个大文件中数据不同的部分。
第二步:证明:可能相同的url必然都在对应的小文件中
假设X为文件A中的某行字符串,Y为文件B中的某行字符串。若X.equals(Y) == true,则hash(X) == hash(Y),从而hash(X) % M == hash(Y) % M。令k = hash(X) % M,则X必然在a(k)中,Y必然在b(k)中。反过来说,如果a(k)中的某个字符串在b(k)中找不到,那么它也不可能出现在B的其他小文件中,因此它一定属于两个大文件的不同部分。
第三步:hash统计。统计小文件a(i)和b(i)中的不同部分,最后归并。代码如下:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
 * Finds the keys that differ between two large log files.
 *
 * <p>Each input file is first split into {@code CUTTED_FILE_NUM} bucket files
 * by the hash of each line's key, so equal keys from both inputs always land
 * in buckets with the same name. Each pair of same-named buckets is then
 * compared in memory and the per-bucket differences are merged.
 *
 * <p>None of the public methods throw on I/O failure: the exception is printed
 * to standard error and the method continues with whatever data it has.
 */
public class TestBigData {
    /** Number of bucket files each large input is split into. */
    private static final int CUTTED_FILE_NUM = 30;
    /** Extension appended to every bucket file name. */
    private static final String FILE_EXTENSIONS = ".log";
    /** Line terminator written after every key in a bucket file. */
    private static final String NEWLINE = "\r\n";
    /** Separator between the key and the rest of an input line. */
    private static final String FIELD_SEPARATOR = "|||";

    /**
     * Splits a large file into bucket files according to the hash of each
     * line's key.
     *
     * <p>The key is the part of the line before the first {@code |||}; a line
     * without that separator is used whole. Key {@code k} is written to the
     * file {@code destinationDirPath + bucket + ".log"}, where {@code bucket}
     * is {@code k.hashCode()} reduced to the range 0..CUTTED_FILE_NUM-1.
     *
     * <p>{@code destinationDirPath} is used as a plain string prefix, so it
     * must end with a path separator to denote a directory. Bucket files are
     * opened in append mode: files left over from an earlier run are extended,
     * not replaced. A bucket file is only created when a key maps to it.
     *
     * @param sourceFilePath     path of the large file to split
     * @param destinationDirPath prefix (normally a directory path ending with
     *                           a separator) under which bucket files are written
     */
    public static void hashCutFile(String sourceFilePath,
            String destinationDirPath) {
        // One writer per bucket, opened on first use and kept open until the
        // whole source has been read; reopening per line is prohibitively slow.
        BufferedWriter[] writers = new BufferedWriter[CUTTED_FILE_NUM];
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(sourceFilePath)));
            String line;
            while ((line = reader.readLine()) != null) {
                // Sample line: 00001016114116820131725061748117041361&4580337030|||112050
                String key = extractKey(line);
                int bucket = bucketOf(key);
                BufferedWriter writer = writers[bucket];
                if (writer == null) {
                    writer = openBucketWriter(destinationDirPath + bucket
                            + FILE_EXTENSIONS);
                    writers[bucket] = writer;
                }
                writer.write(key);
                writer.write(NEWLINE);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(reader);
            for (int i = 0; i < writers.length; i++) {
                close(writers[i]);
            }
        }
    }

    /**
     * Returns the lines that occur in exactly one of the two files.
     *
     * <p>The result lists the lines of {@code fileA} that are absent from
     * {@code fileB} (in file order, repeated lines kept), followed by the
     * lines of {@code fileB} that are absent from {@code fileA}. A file that
     * cannot be read is treated as empty.
     *
     * @param fileA path of the first file
     * @param fileB path of the second file
     * @return the lines that are not shared by both files
     */
    public static List<String> findDifference(String fileA, String fileB) {
        List<String> linesA = readLines(fileA);
        List<String> linesB = readLines(fileB);
        List<String> partialResult = Complement(linesA, linesB);
        partialResult.addAll(Complement(linesB, linesA));
        return partialResult;
    }

    /**
     * Returns every line of a file that has no counterpart on the other side,
     * so all of its content counts as "different".
     *
     * @param file path of the file to read
     * @return all lines of the file in order; empty if the file cannot be read
     */
    public static List<String> findDifference(String file) {
        return readLines(file);
    }

    /**
     * Computes the intersection of two lists.
     *
     * @param list1 list whose order (and repeated elements) the result follows
     * @param list2 list to test membership against
     * @return the elements of {@code list1} that also occur in {@code list2}
     */
    public static List<String> Intersection(List<String> list1,
            List<String> list2) {
        Set<String> lookup = new HashSet<String>(list2);
        List<String> shared = new ArrayList<String>();
        for (String candidate : list1) {
            if (lookup.contains(candidate)) {
                shared.add(candidate);
            }
        }
        return shared;
    }

    /**
     * Computes the difference {@code list1 - list2}.
     *
     * @param list1 list whose order (and repeated elements) the result follows
     * @param list2 list of elements to exclude
     * @return the elements of {@code list1} that do not occur in {@code list2}
     */
    public static List<String> Complement(List<String> list1, List<String> list2) {
        Set<String> excluded = new HashSet<String>(list2);
        List<String> remaining = new ArrayList<String>();
        for (String candidate : list1) {
            if (!excluded.contains(candidate)) {
                remaining.add(candidate);
            }
        }
        return remaining;
    }

    /**
     * Merges the differences of all bucket files of two split inputs.
     *
     * <p>Bucket files present in both directories are compared pairwise; the
     * full content of a bucket file present in only one directory is added to
     * the result. A directory that does not exist is treated as empty, and
     * only regular files are considered.
     *
     * @param dirAPath directory holding the bucket files of large file A
     * @param dirBPath directory holding the bucket files of large file B
     * @return the set of keys that occur in only one of the two inputs
     */
    public static Set<String> mergeDifferenceList(String dirAPath,
            String dirBPath) {
        Set<String> resultSet = new HashSet<String>();
        File dirA = new File(dirAPath);
        File dirB = new File(dirBPath);
        List<String> namesA = listFileNames(dirA);
        List<String> namesB = listFileNames(dirB);

        // Buckets that exist on both sides are compared line by line.
        List<String> commonNames = Intersection(namesA, namesB);
        for (String name : commonNames) {
            resultSet.addAll(findDifference(pathOf(dirA, name), pathOf(dirB, name)));
        }
        // Buckets that exist on one side only differ in their entirety.
        for (String name : Complement(namesA, commonNames)) {
            resultSet.addAll(findDifference(pathOf(dirA, name)));
        }
        for (String name : Complement(namesB, commonNames)) {
            resultSet.addAll(findDifference(pathOf(dirB, name)));
        }
        return resultSet;
    }

    /**
     * Splits two log files, prints every key that occurs in only one of them
     * and finally prints the elapsed time.
     *
     * @param args optional: {@code fileA fileB destinationDirA destinationDirB};
     *             when fewer than four arguments are given the built-in
     *             default paths are used
     */
    public static void main(String[] args) {
        long t1 = System.currentTimeMillis();
        String fileA = "C:/Users/jim/Desktop/big data/sourceFile1.log";
        String fileB = "C:/Users/jim/Desktop/big data/sourceFile2.log";
        String destinationA = "C:/Users/jim/Desktop/big data/destinationA/";
        String destinationB = "C:/Users/jim/Desktop/big data/destinationB/";
        if (args != null && args.length >= 4) {
            fileA = args[0];
            fileB = args[1];
            destinationA = args[2];
            destinationB = args[3];
        }
        hashCutFile(fileA, destinationA);
        hashCutFile(fileB, destinationB);
        Set<String> differences = mergeDifferenceList(destinationA, destinationB);
        for (String key : differences) {
            System.out.println(key);
        }
        long t2 = System.currentTimeMillis();
        System.out.println("时间t= " + (t2 - t1) + "ms");
    }

    /** Returns the part of {@code line} before the first field separator, or the whole line if there is none. */
    private static String extractKey(String line) {
        // indexOf/substring instead of String.split: split returns an empty
        // array for a line that consists only of separators.
        int separatorAt = line.indexOf(FIELD_SEPARATOR);
        return separatorAt >= 0 ? line.substring(0, separatorAt) : line;
    }

    /** Maps a key to its bucket index in the range 0..CUTTED_FILE_NUM-1. */
    private static int bucketOf(String key) {
        int remainder = key.hashCode() % CUTTED_FILE_NUM;
        // hashCode may be negative, and so may the remainder.
        return remainder < 0 ? remainder + CUTTED_FILE_NUM : remainder;
    }

    /** Opens a bucket file for appending, creating its parent directory if it is missing. */
    private static BufferedWriter openBucketWriter(String path) throws IOException {
        File bucketFile = new File(path);
        File parent = bucketFile.getParentFile();
        if (parent != null && !parent.exists()) {
            // A failure here surfaces as an IOException from FileWriter below.
            parent.mkdirs();
        }
        return new BufferedWriter(new FileWriter(bucketFile, true));
    }

    /** Reads all lines of a file; on an I/O error the lines read so far are returned. */
    private static List<String> readLines(String path) {
        List<String> lines = new ArrayList<String>();
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new FileReader(new File(path)));
            String line;
            while ((line = reader.readLine()) != null) {
                lines.add(line);
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            close(reader);
        }
        return lines;
    }

    /** Returns the names of the regular files directly inside {@code dir}; empty if it cannot be listed. */
    private static List<String> listFileNames(File dir) {
        List<String> names = new ArrayList<String>();
        File[] entries = dir.listFiles();
        if (entries == null) {
            // listFiles returns null when dir is missing or not a directory.
            return names;
        }
        for (File entry : entries) {
            if (entry.isFile()) {
                names.add(entry.getName());
            }
        }
        return names;
    }

    /** Builds the absolute path of {@code name} inside {@code dir} using the platform separator. */
    private static String pathOf(File dir, String name) {
        return new File(dir, name).getAbsolutePath();
    }

    /** Closes a resource if it was opened, reporting (not propagating) a failure. */
    private static void close(java.io.Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}