给定1亿int,找出最大的100个

给定1亿个数,找出 最大的 100个

1. 用一个长度是 101 的数组,建立 小顶堆(0号元素不用,主要是为了使用堆的性质:父结点i,则,左右 子结点 是 2i 和 2i+1)

2. 用堆顶 和 每个 取得的数 进行比较。(a. 堆顶 >= 取得的数,则,忽略 取得的数 b. 否则,把堆顶 替换为 取得的数)

3. 替换堆顶之后,堆顶的左右子树 仍然各自满足 小顶堆 的性质,只有 堆顶 可能不满足。因此只需要把 堆顶 向下调整(调整算法 就是 构建堆时使用的 下沉调整算法)


其实,无论是1亿,还是 几亿个,都无所谓,因为,耗费的内存 就是 长度101 的数组。读取1亿个数,就像流式读取文件一样,每次只读取一个数,并不会把全部数据 一次性读入内存中

另外,可以使用 linux命令来产生随机文件,然后,每次读取4个字节,这样,其实就是读取了一个整数,命令是这样的

// 当前目录下,产生 512M 随机数据 二进制文件
dd if=/dev/urandom of=random.dat bs=1M count=512

程序的完整代码如下

import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Arrays;
import java.util.NoSuchElementException;

import com.util.CommUtil;

/**
 * Finds the n largest ints of a stream using a fixed-size min-heap, so memory
 * use depends only on n and not on how many values the stream holds.
 */
public class TopN {
	/**
	 * Returns the n largest values produced by {@code gi}.
	 *
	 * <p>The result uses the 1-based heap layout shared with {@link Heap}:
	 * element 0 is a placeholder set to {@code Integer.MAX_VALUE} and must be
	 * ignored; elements 1..k hold the result in min-heap order (not sorted),
	 * with the smallest retained value at index 1.
	 *
	 * @param n  how many of the largest values to keep; must be non-negative
	 * @param gi source of ints
	 * @return an array of length {@code n + 1}, or of length {@code k + 1} when
	 *         the source holds only {@code k < n} values
	 * @throws IllegalArgumentException if {@code n} is negative
	 */
	public static int[] topN(int n, GenerateInt gi) {
		if (n < 0) {
			throw new IllegalArgumentException("n must be non-negative: " + n);
		}
		// please note: 0 element is ignored
		int[] arrayTopN = new int[n + 1];
		arrayTopN[0] = Integer.MAX_VALUE;
		if (n == 0) {
			// Nothing to keep; the heap has no root to compare against.
			return arrayTopN;
		}

		int filled = 0;
		// Check the capacity BEFORE calling hasNext(): hasNext() may read a
		// value from the stream, and testing it first would consume (and
		// lose) one value once the heap is already full.
		while (filled < n && gi.hasNext()) {
			filled++;
			arrayTopN[filled] = gi.next();
		}

		if (filled < n) {
			// The source ran dry early. Drop the unused slots so their
			// default 0 values are not reported as if they were input.
			arrayTopN = Arrays.copyOf(arrayTopN, filled + 1);
			Heap.constructSmallRootHeap(arrayTopN);
			return arrayTopN;
		}

		Heap.constructSmallRootHeap(arrayTopN);
		while (gi.hasNext()) {
			insertNewNumber(gi.next(), arrayTopN);
		}
		return arrayTopN;
	}

	/**
	 * Offers one value to a full min-heap: if it beats the current minimum
	 * (the root at index 1) it replaces the root and is sunk into place.
	 */
	private static void insertNewNumber(int newNumber, int[] smallRootHeap) {
		// too small, just ignore the newNumber
		if (smallRootHeap[1] >= newNumber) {
			return;
		}
		// newNumber > smallRootHeap[1]
		smallRootHeap[1] = newNumber;
		Heap.sinkRoot(1, smallRootHeap);
	}

	/**
	 * Runs {@link #topN} over the binary file at {@code fullPath}, then prints
	 * the resulting array and the elapsed time.
	 *
	 * @param n        how many of the largest values to keep
	 * @param fullPath path of the file to read
	 * @throws Exception if the file cannot be opened or closed
	 */
	public static void test_TopN(int n, String fullPath) throws Exception {
		long begin = System.currentTimeMillis();

		FileInputStream fis = new FileInputStream(fullPath);
		int[] arrayTopN;
		try {
			arrayTopN = topN(n, new GenerateInt(fis));
		} finally {
			// Close the file even when topN fails part-way through.
			fis.close();
		}
		System.out.println(Arrays.toString(arrayTopN));

		System.out.printf("TopN cost millionseconds: %d",
				(System.currentTimeMillis() - begin));
	}

	public static void main(String[] args) throws Exception {
		String fullPath = "/home/marvin/random.dat";
		test_TopN(10, fullPath);
	}

}

/**
 * Min-heap helpers working on a 1-based int array: slot 0 is never touched,
 * and the children of slot i live at slots 2i and 2i + 1.
 */
class Heap {
	/**
	 * Rearranges slots 1..length-1 of {@code orgnArray} into a min-heap by
	 * sinking every non-leaf slot, from the last one back to the root.
	 */
	public static void constructSmallRootHeap(int[] orgnArray) {
		// Slot 0 is excluded, so the heap holds length - 1 elements and the
		// last slot that has a child is at half of that count.
		int elementCount = orgnArray.length - 1;
		for (int slot = elementCount / 2; slot >= 1; slot--) {
			sinkRoot(slot, orgnArray);
		}
	}

	/**
	 * Moves the value at {@code rootIndex} down the tree, swapping it with
	 * its smaller child until neither child is smaller than it.
	 */
	public static void sinkRoot(int rootIndex, int[] smallRootHeap) {
		final int last = smallRootHeap.length - 1;
		int parent = rootIndex;
		for (int child = 2 * parent; child <= last; child = 2 * parent) {
			// Start from the left child; move to the right one only when it
			// exists and is strictly smaller.
			int sibling = child + 1;
			if (sibling <= last && smallRootHeap[child] > smallRootHeap[sibling]) {
				child = sibling;
			}

			// Parent is not bigger than its smaller child: already in place.
			if (smallRootHeap[parent] <= smallRootHeap[child]) {
				return;
			}

			CommUtil.swap(parent, child, smallRootHeap);
			parent = child;
		}
	}

	public static void main(String[] args) {
		int[] sample = { Integer.MAX_VALUE, 49, 38, 65, 97, 76, 13, 27, 49 };
		System.out.printf("Before: %s\n", Arrays.toString(sample));
		constructSmallRootHeap(sample);
		System.out.printf("After : %s\n", Arrays.toString(sample));
	}
}

/**
 * Reads a binary file as a sequence of 4-byte big-endian ints (the format of
 * {@link DataInputStream#readInt()}).
 *
 * <p>{@link #hasNext()} may be called any number of times without skipping
 * data; only {@link #next()} moves on to the following value. Trailing bytes
 * that do not form a complete int are treated as end of input.
 */
class GenerateInt {
	// To generate a random file, you can use the following linux command
	// dd if=/dev/urandom of=random.dat bs=1M count=512
	private DataInputStream dis;
	// Value read ahead by hasNext() and not yet handed out by next().
	private int current;
	// True while 'current' holds a value that next() has not returned yet.
	private boolean buffered = false;
	private long totalCount = 0;

	/**
	 * Wraps {@code fis} in a buffered data stream. The caller keeps ownership
	 * of {@code fis} and is responsible for closing it.
	 */
	public GenerateInt(FileInputStream fis) {
		this.dis = new DataInputStream(new BufferedInputStream(fis));
	}

	/**
	 * Reports whether another int is available, reading it ahead if needed.
	 * Repeated calls without an intervening {@link #next()} do not consume
	 * further input.
	 *
	 * @return {@code true} if {@link #next()} will return a value,
	 *         {@code false} once the end of the input has been reached
	 * @throws RuntimeException wrapping the {@link IOException} if reading
	 *         fails for a reason other than end of input
	 */
	public boolean hasNext() {
		if (buffered) {
			return true;
		}
		try {
			current = dis.readInt();
		} catch (EOFException e) {
			// Normal termination: no complete int is left in the stream.
			System.out.println("Generate int count: " + totalCount);
			return false;
		} catch (IOException e) {
			// A real read failure must not look like a clean end of input,
			// otherwise callers would silently work on truncated data.
			throw new RuntimeException(e);
		}
		totalCount++;
		buffered = true;
		return true;
	}

	/**
	 * Returns the next int and advances past it.
	 *
	 * @throws NoSuchElementException if the input is exhausted
	 */
	public int next() {
		if (!hasNext()) {
			throw new NoSuchElementException("no more ints in the stream");
		}
		buffered = false;
		return current;
	}

	/** Reads every int of the file and prints how long that took. */
	private static void readIntFromFile(String fullPath) throws Exception {
		FileInputStream fis = new FileInputStream(fullPath);
		try {
			long begin = System.currentTimeMillis();
			GenerateInt gi = new GenerateInt(fis);
			while (gi.hasNext()) {
				// next() is what advances the stream; the value is not needed.
				gi.next();
			}
			System.out.printf("Read int cost: %d",
					(System.currentTimeMillis() - begin));
		} finally {
			// Close the file even if reading fails.
			fis.close();
		}
	}

	public static void main(String[] args) throws Exception {
		String randomFile = "/home/marvin/random.dat";
		readIntFromFile(randomFile);
	}
}


还有一种是找出 1亿个数中,出现次数最多的 100 个数 ,这种可以通过 trie树来查找



阅读更多

没有更多推荐了,返回首页