找出两个大文件中数据不同部分

最新推荐文章于 2021-09-10 11:11:00 发布

jim8757

最新推荐文章于 2021-09-10 11:11:00 发布

阅读量1.6k

点赞数

分类专栏：海量数据处理文章标签：大数据

本文链接：https://blog.csdn.net/jim8757/article/details/84466876

版权

海量数据处理专栏收录该内容

1 篇文章 0 订阅

订阅专栏

如何找出两个大文件中的数据不同部分

问题描述：对比两个大于4G的日志文件A和B，如何高效地找出A和B中的数据不同部分。

整体思路：

第一步：文件分割；将大文件分割成多个小文件。本文采用哈希函数来分割大文件，扫描文件A，对每行字符求hash(url) % M，url是文件中的一行字符串，本文中的hash函数取JDK自带的hashCode方法，M表示分解的文件数目。根据所得的值，将url写入到对应的小文件中，如hash(url) % M = 4，则写入编号为4的小文件a(4)中。如此，大文件A可以分为<a(0),a(1),a(2),...a(M-1)>，同理，大文件B可以分为<b(0),b(1),...b(M-1)>。可能相同的url必然都在对应的小文件中，也就是说，我们只需要寻找对应的小文件中的不同部分，然后归并所有的不同部分就是整个两个大文件的数据不同部分。

第二步：证明：可能相同的url必然都在对应的小文件中

假设X为文件A中的某行字符串，Y为文件B中的某行字符串，X.equals(Y) == true，则hash(X) == hash(Y)，hash(X) % M == hash(Y) % M，则令k = hash(X) % M，则X必然在a(k)中，Y必然在b(k)中。（Java中hashCode可能为负数，代码在取模结果为负时会加上M，使k落在[0, M-1]范围内；该调整对X和Y完全一致，不影响上述结论。）

第三步：hash统计。统计小文件a(i)和b(i)中的不同部分，最后归并。代码如下：

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;

/**
 * Finds the records that differ between two very large log files.
 *
 * <p>Each input file is first partitioned into {@link #CUTTED_FILE_NUM} bucket
 * files by the hash of its record key, so equal keys from both inputs always
 * end up in bucket files with the same name. Matching buckets are then compared
 * in memory and the per-bucket differences are merged into one set.
 */
public class TestBigData {

	/** Number of bucket files each large input is partitioned into. */
	private static final int CUTTED_FILE_NUM = 30;
	/** File-name extension of the bucket files. */
	private static final String FILE_EXTENSIONS = ".log";
	/** Line terminator written after every key in a bucket file. */
	private static final String NEWLINE = "\r\n";
	/** Separator between the record key and the rest of an input line. */
	private static final String FIELD_SEPARATOR = "|||";

	/**
	 * Partitions a large file into bucket files by the hash of each line's key.
	 *
	 * <p>An input line looks like
	 * {@code 00001016114116820131725061748117041361&4580337030|||112050}; only the
	 * part before the first {@code |||} (the key) is hashed and written out.
	 * Bucket {@code i} is written to {@code destinationDirPath + i + ".log"}.
	 * Bucket files are created lazily and opened in append mode, so output left
	 * by an earlier run into the same destination is kept; clear the destination
	 * first when a fresh comparison is wanted.
	 *
	 * <p>I/O errors are printed to standard error and stop the partitioning; they
	 * are not rethrown.
	 *
	 * @param sourceFilePath
	 *            path of the large file to partition
	 * @param destinationDirPath
	 *            path prefix of the bucket files, normally a directory path
	 *            ending with a separator; missing directories are created
	 */
	public static void hashCutFile(String sourceFilePath,
			String destinationDirPath) {
		BufferedReader reader = null;
		// One writer per bucket, kept open for the whole pass instead of
		// reopening the target file for every single input line.
		BufferedWriter[] writers = new BufferedWriter[CUTTED_FILE_NUM];
		try {
			reader = new BufferedReader(new FileReader(new File(sourceFilePath)));
			String line;
			while ((line = reader.readLine()) != null) {
				String key = extractKey(line);
				int bucket = bucketOf(key);
				BufferedWriter writer = writers[bucket];
				if (writer == null) {
					File target = new File(destinationDirPath + bucket
							+ FILE_EXTENSIONS);
					File parent = target.getParentFile();
					if (parent != null) {
						// A failure here surfaces as an IOException from FileWriter.
						parent.mkdirs();
					}
					writer = new BufferedWriter(new FileWriter(target, true));
					writers[bucket] = writer;
				}
				writer.write(key);
				writer.write(NEWLINE);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
			for (int i = 0; i < writers.length; i++) {
				closeQuietly(writers[i]);
			}
		}
	}

	/**
	 * Returns the lines that occur in exactly one of the two files.
	 *
	 * <p>Lines only in {@code fileA} come first, followed by lines only in
	 * {@code fileB}; a line repeated within one file is reported once per
	 * occurrence. A file that cannot be read contributes the lines read before
	 * the error (none if it could not be opened).
	 *
	 * @param fileA
	 *            path of the first file
	 * @param fileB
	 *            path of the second file
	 * @return the lines of either file that are absent from the other
	 */
	public static List<String> findDifference(String fileA, String fileB) {
		List<String> linesA = readLines(fileA);
		List<String> linesB = readLines(fileB);
		List<String> difference = new ArrayList<String>();
		difference.addAll(Complement(linesA, linesB));
		difference.addAll(Complement(linesB, linesA));
		return difference;
	}

	/**
	 * Returns every line of a file that has no counterpart to compare against,
	 * i.e. all of its lines count as differences.
	 *
	 * @param file
	 *            path of the file to read
	 * @return all lines of the file, in file order
	 */
	public static List<String> findDifference(String file) {
		return readLines(file);
	}

	/**
	 * Computes the intersection of two lists.
	 *
	 * @param list1
	 *            list whose order (and duplicates) the result follows
	 * @param list2
	 *            list to test membership against
	 * @return the elements of {@code list1} that also occur in {@code list2}
	 */
	public static List<String> Intersection(List<String> list1,
			List<String> list2) {
		Set<String> lookup = new HashSet<String>(list2);
		List<String> common = new ArrayList<String>();
		for (String candidate : list1) {
			if (lookup.contains(candidate)) {
				common.add(candidate);
			}
		}
		return common;
	}

	/**
	 * Computes the difference {@code list1 - list2}.
	 *
	 * @param list1
	 *            list whose order (and duplicates) the result follows
	 * @param list2
	 *            list of elements to exclude
	 * @return the elements of {@code list1} that do not occur in {@code list2}
	 */
	public static List<String> Complement(List<String> list1, List<String> list2) {
		Set<String> excluded = new HashSet<String>(list2);
		List<String> remaining = new ArrayList<String>();
		for (String candidate : list1) {
			if (!excluded.contains(candidate)) {
				remaining.add(candidate);
			}
		}
		return remaining;
	}

	/**
	 * Merges the differences of all bucket files of two partitioned inputs.
	 *
	 * <p>Bucket files present in both directories are compared pairwise; a
	 * bucket file present in only one directory contributes all of its lines.
	 * A directory that does not exist is treated as containing no bucket files.
	 *
	 * @param dirAPath
	 *            directory holding the bucket files of large file A
	 * @param dirBPath
	 *            directory holding the bucket files of large file B
	 * @return the set of keys that occur in only one of the two inputs
	 */
	public static Set<String> mergeDifferenceList(String dirAPath,
			String dirBPath) {
		File dirA = new File(dirAPath);
		File dirB = new File(dirBPath);
		List<String> namesA = listFileNames(dirA);
		List<String> namesB = listFileNames(dirB);
		List<String> sharedNames = Intersection(namesA, namesB);

		Set<String> resultSet = new HashSet<String>();
		// Paths are built with File(parent, child) so the platform's own
		// separator is used rather than a hard-coded backslash.
		for (String name : sharedNames) {
			resultSet.addAll(findDifference(new File(dirA, name).getPath(),
					new File(dirB, name).getPath()));
		}
		for (String name : Complement(namesA, sharedNames)) {
			resultSet.addAll(findDifference(new File(dirA, name).getPath()));
		}
		for (String name : Complement(namesB, sharedNames)) {
			resultSet.addAll(findDifference(new File(dirB, name).getPath()));
		}
		return resultSet;
	}

	/**
	 * Demo entry point: partitions two hard-coded log files, prints every key
	 * that differs between them and the elapsed time.
	 *
	 * @param args
	 *            unused
	 */
	public static void main(String[] args) {
		long t1 = System.currentTimeMillis();
		String fileA = "C:/Users/jim/Desktop/big data/sourceFile1.log";
		String fileB = "C:/Users/jim/Desktop/big data/sourceFile2.log";
		String destinationA = "C:/Users/jim/Desktop/big data/destinationA/";
		String destinationB = "C:/Users/jim/Desktop/big data/destinationB/";
		hashCutFile(fileA, destinationA);
		hashCutFile(fileB, destinationB);
		Set<String> differences = mergeDifferenceList(destinationA, destinationB);
		for (String key : differences) {
			System.out.println(key);
		}
		long t2 = System.currentTimeMillis();
		System.out.println("时间t= " + (t2 - t1) + "ms");
	}

	/** Returns the part of a line before the first field separator, or the whole line if there is none. */
	private static String extractKey(String line) {
		int cut = line.indexOf(FIELD_SEPARATOR);
		return cut < 0 ? line : line.substring(0, cut);
	}

	/** Maps a key to its bucket index in the range 0 to CUTTED_FILE_NUM - 1. */
	private static int bucketOf(String key) {
		int bucket = key.hashCode() % CUTTED_FILE_NUM;
		// hashCode() may be negative, and so may the remainder.
		return bucket < 0 ? bucket + CUTTED_FILE_NUM : bucket;
	}

	/** Reads all lines of a file; on an I/O error the lines read so far are returned. */
	private static List<String> readLines(String path) {
		List<String> lines = new ArrayList<String>();
		BufferedReader reader = null;
		try {
			reader = new BufferedReader(new FileReader(new File(path)));
			String line;
			while ((line = reader.readLine()) != null) {
				lines.add(line);
			}
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
		}
		return lines;
	}

	/** Lists the names of the regular files directly inside a directory; empty if it cannot be listed. */
	private static List<String> listFileNames(File dir) {
		List<String> names = new ArrayList<String>();
		File[] entries = dir.listFiles();
		if (entries == null) {
			return names;
		}
		for (File entry : entries) {
			if (entry.isFile()) {
				names.add(entry.getName());
			}
		}
		return names;
	}

	/** Closes a resource if it was opened, printing (not rethrowing) any close failure. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}
}

jim8757

关注

0
点赞
踩
6

收藏

觉得还不错? 一键收藏
1
评论
找出两个大文件中数据不同部分

如何找出两个大文件中的数据不同部分问题描述：对比两个大于4G的日志文件A和B，如何高效的找出A和B中的数据不同部分。整体思路：第一步：文件分割；将大文件分割成多个小文件。本文采用哈希函数来分割大文件，扫描文件A，对每行字符求hash(url) % M，url是文件中的一行字符串，本文中的hash函数取JDK自带的hashCode方法，M表示分解的文件数目。根据所得的值，将url写入...
复制链接

扫一扫

专栏目录