给定1亿个数,找出 最大的 100个
1. 用一个长度是 101 的数组,建立 小顶堆(0号元素不用,主要是为了使用堆的性质:父结点i,则,左右 子结点 是 2i 和 2i+1)
2. 用堆顶 和 每个 取得的数 进行比较。(a. 堆顶 >= 取得的数,则,忽略 取得的数 b. 否则,把堆顶 替换为 取得的数)
3. 替换堆顶之后,堆顶的左右子树仍然各自满足小顶堆的性质,只有堆顶本身可能不满足,因此只需要对堆顶做一次下沉调整(调整算法 就是 构建堆时的 调整算法)
其实,无论是 1 亿个数,还是几亿个数,都无所谓,因为耗费的内存就是长度为 101 的数组。读取 1 亿个数,就像流式读取文件一样,并不会把数据全部读入内存中
另外,可以使用 linux命令来产生随机文件,然后,每次读取4个字节,这样,其实就是读取了一个整数,命令是这样的
// 当前目录下,产生 512M 随机数据 二进制文件
dd if=/dev/urandom of=random.dat bs=1M count=512
程序的完整代码如下
import java.io.BufferedInputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.Arrays;
import com.util.CommUtil;
/**
 * Streaming selection of the n largest integers using a fixed-size min-heap.
 *
 * <p>Only {@code n + 1} ints are held in memory regardless of how many numbers
 * the source produces. The heap is stored 1-based: slot 0 is a placeholder so
 * that the children of node {@code i} are {@code 2i} and {@code 2i + 1}.
 */
public class TopN {

    /**
     * Finds the n largest numbers produced by {@code gi}.
     *
     * @param n  how many of the largest numbers to keep; must not be negative
     * @param gi source of numbers, consumed until it reports no more values
     *           (not consumed at all when {@code n == 0})
     * @return an array whose slot 0 holds the placeholder
     *         {@code Integer.MAX_VALUE} and whose slots {@code 1..k} hold the
     *         selected numbers in min-heap order (not sorted), where
     *         {@code k = n} normally and {@code k} is the number of values
     *         read when the source produced fewer than n
     * @throws IllegalArgumentException if {@code n} is negative
     */
    public static int[] topN(int n, GenerateInt gi) {
        if (n < 0) {
            throw new IllegalArgumentException("n must not be negative: " + n);
        }

        // please note: 0 element is ignored
        int[] arrayTopN = new int[n + 1];
        arrayTopN[0] = Integer.MAX_VALUE;
        if (n == 0) {
            // Nothing to select; there is no heap slot 1 to compare against.
            return arrayTopN;
        }

        // Fill the heap storage with the first n values. The capacity check
        // comes first on purpose: hasNext() may read ahead from the stream, so
        // it must only be called while there is still a slot for the value,
        // otherwise a number could be consumed and never looked at.
        int filled = 0;
        while (filled < n && gi.hasNext()) {
            filled++;
            arrayTopN[filled] = gi.next();
        }

        if (filled < n) {
            // The source ran dry early: drop the unused slots so the result
            // does not contain zeros that were never in the input.
            arrayTopN = Arrays.copyOf(arrayTopN, filled + 1);
            Heap.constructSmallRootHeap(arrayTopN);
            return arrayTopN;
        }

        Heap.constructSmallRootHeap(arrayTopN);

        // Every further value competes with the current minimum at the root.
        while (gi.hasNext()) {
            insertNewNumber(gi.next(), arrayTopN);
        }
        return arrayTopN;
    }

    /**
     * Offers one value to a full min-heap: if it is larger than the root
     * (the smallest value kept so far) it replaces the root and is sunk to
     * its place; otherwise it is discarded.
     *
     * @param newNumber     candidate value
     * @param smallRootHeap 1-based min-heap with at least one real element
     */
    private static void insertNewNumber(int newNumber, int[] smallRootHeap) {
        // too small, just ignore the newNumber
        if (newNumber <= smallRootHeap[1]) {
            return;
        }
        // newNumber > smallRootHeap[1]: evict the minimum and restore the heap
        smallRootHeap[1] = newNumber;
        Heap.sinkRoot(1, smallRootHeap);
    }

    /**
     * Runs {@link #topN} over a binary file of 4-byte integers, prints the
     * resulting array (including the slot-0 placeholder) and the elapsed time.
     *
     * @param n        how many of the largest numbers to keep
     * @param fullPath path of the binary input file
     * @throws Exception if the file cannot be opened or closed
     */
    public static void test_TopN(int n, String fullPath) throws Exception {
        long begin = System.currentTimeMillis();
        int[] arrayTopN;
        // try-with-resources closes the file even if topN throws
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            arrayTopN = topN(n, new GenerateInt(fis));
        }
        System.out.println(Arrays.toString(arrayTopN));
        System.out.printf("TopN cost millionseconds: %d",
                (System.currentTimeMillis() - begin));
    }

    /** Demo entry point: prints the 10 largest ints of a fixed sample file. */
    public static void main(String[] args) throws Exception {
        String fullPath = "/home/marvin/random.dat";
        test_TopN(10, fullPath);
    }
}
/**
 * Min-heap ("small root heap") helpers over a 1-based int array.
 *
 * <p>Slot 0 of every array is a placeholder and never takes part in the heap,
 * so the children of node {@code i} sit at {@code 2i} and {@code 2i + 1}.
 */
class Heap {

    /**
     * Rearranges slots {@code 1..length-1} of the array into a min-heap,
     * in place, by sinking every non-leaf node from the last one up to the root.
     *
     * @param orgnArray array to heapify; slot 0 is left untouched
     */
    public static void constructSmallRootHeap(int[] orgnArray) {
        // Slot 0 does not count as a heap element.
        final int heapSize = orgnArray.length - 1;
        // heapSize / 2 is the last node that still has a child.
        for (int node = heapSize / 2; node >= 1; node--) {
            sinkRoot(node, orgnArray);
        }
    }

    /**
     * Moves the value at {@code rootIndex} down the tree, swapping it with its
     * smaller child, until it is no larger than both children or it reaches
     * a leaf.
     *
     * @param rootIndex     index of the node to sink
     * @param smallRootHeap 1-based heap array
     */
    public static void sinkRoot(int rootIndex, int[] smallRootHeap) {
        final int last = smallRootHeap.length - 1;
        int parent = rootIndex;

        for (int child = 2 * parent; child <= last; child = 2 * parent) {
            // Prefer the right child only when it exists and is strictly smaller.
            final int sibling = child + 1;
            if (sibling <= last && smallRootHeap[child] > smallRootHeap[sibling]) {
                child = sibling;
            }

            // Parent already no larger than its smallest child: heap order holds.
            if (smallRootHeap[parent] <= smallRootHeap[child]) {
                return;
            }

            // Parent is bigger, push it one level down and continue from there.
            CommUtil.swap(parent, child, smallRootHeap);
            parent = child;
        }
    }

    /** Small demo: heapifies a sample array and prints it before and after. */
    public static void main(String[] args) {
        int[] sample = { Integer.MAX_VALUE, 49, 38, 65, 97, 76, 13, 27, 49 };
        System.out.printf("Before: %s\n", Arrays.toString(sample));
        constructSmallRootHeap(sample);
        System.out.printf("After : %s\n", Arrays.toString(sample));
    }
}
/**
 * Reads a binary stream as a sequence of big-endian 4-byte integers, exposed
 * through an iterator-like {@code hasNext()} / {@code next()} pair.
 *
 * <p>To generate a random file, you can use the following linux command:
 * {@code dd if=/dev/urandom of=random.dat bs=1M count=512}
 */
class GenerateInt {
    private final DataInputStream dis;
    // Most recently read value.
    private int i;
    // True while i holds a value that next() has not handed out yet.
    private boolean pending;
    // True once the end of the stream has been seen.
    private boolean exhausted;
    private long totalCount = 0;

    /**
     * Creates a reader over a file stream. The stream is buffered internally;
     * closing it remains the caller's responsibility.
     *
     * @param fis file stream to read from; must not be null
     */
    public GenerateInt(FileInputStream fis) {
        this((InputStream) fis);
    }

    /**
     * Creates a reader over any input stream. The stream is buffered
     * internally; closing it remains the caller's responsibility.
     *
     * @param in stream to read from; must not be null
     * @throws NullPointerException if {@code in} is null
     */
    public GenerateInt(InputStream in) {
        if (in == null) {
            throw new NullPointerException("in");
        }
        this.dis = new DataInputStream(new BufferedInputStream(in));
    }

    /**
     * Reports whether another integer is available, reading it ahead if
     * necessary. Repeated calls without an intervening {@link #next()} do not
     * consume further input. When the end of the stream is first reached, the
     * number of integers read is printed to standard output.
     *
     * @return true if {@link #next()} will return a fresh value
     * @throws RuntimeException wrapping the {@link IOException} if reading
     *         fails for a reason other than end of stream
     */
    public boolean hasNext() {
        if (pending) {
            return true;
        }
        if (exhausted) {
            return false;
        }
        try {
            i = dis.readInt();
        } catch (EOFException endOfStream) {
            // Fewer than 4 bytes left: the sequence is finished.
            exhausted = true;
            System.out.println("Generate int count: " + totalCount);
            return false;
        } catch (IOException e) {
            // A real read failure must not be mistaken for end of data.
            throw new RuntimeException(e);
        }
        totalCount++;
        pending = true;
        return true;
    }

    /**
     * Returns the value fetched by the preceding successful
     * {@link #hasNext()} call and marks it as consumed. If called without
     * such a call, the most recently read value (0 if none) is returned.
     *
     * @return the current integer
     */
    public int next() {
        pending = false;
        return i;
    }

    /**
     * Reads every integer of the given file, discarding the values, and
     * prints the elapsed time in milliseconds.
     */
    private static void readIntFromFile(String fullPath) throws Exception {
        // try-with-resources closes the file even if reading throws
        try (FileInputStream fis = new FileInputStream(fullPath)) {
            long begin = System.currentTimeMillis();
            GenerateInt gi = new GenerateInt(fis);
            while (gi.hasNext()) {
                // Consume the value so that hasNext() advances to the next one.
                gi.next();
            }
            System.out.printf("Read int cost: %d",
                    (System.currentTimeMillis() - begin));
        }
    }

    /** Demo entry point: times a full read of a fixed sample file. */
    public static void main(String[] args) throws Exception {
        String randomFile = "/home/marvin/random.dat";
        readIntFromFile(randomFile);
    }
}
还有一类问题是:找出 1亿个数中,出现次数最多的 100 个数。这种可以先用 trie树(或哈希表)统计每个数的出现次数,再用同样的小顶堆方法,按出现次数取出前 100 个