网上很多相关博客都只写了如何在海量数据中找到最符合的 k 个值——没错,只是值。但很多时候我们更想要的是这 k 个值对应的下标(索引)。比如我最近在写旅游推荐算法,通过向量运算得到最终结果,但这些结果只能代表景点与用户的相似程度,并不能代表景点本身;能代表景点本身的是其对应的下标,所以要把景点推荐给用户,就必须拿到景点的下标。鉴于此,我对 https://blog.csdn.net/u012129558/article/details/51531796 里的算法做了功能上的增强,即同时找到这 k 个值对应的索引。
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
 * Selects the k largest elements of an {@code int} array using a fixed-size min-heap,
 * returning either the values themselves or their positions in the input.
 *
 * <p>The heap root always holds the smallest of the current candidates, so a new element
 * only has to be compared against the root to decide whether it belongs in the result.
 * Results are returned in heap order, not sorted order.
 */
class TopK {

    /**
     * Returns the k largest values of {@code array}.
     *
     * @param array the input values; it is not modified
     * @param k     how many values to select; if larger than {@code array.length},
     *              every element is returned
     * @return a new array of {@code min(k, array.length)} values in heap order
     *         (the smallest selected value is at index 0)
     * @throws NullPointerException     if {@code array} is null
     * @throws IllegalArgumentException if {@code k} is negative
     */
    public int[] topK(int[] array, int k) {
        int size = heapSizeFor(array, k);
        int[] top = new int[size];
        if (size == 0) {
            // Nothing to select; also avoids touching top[0] on an empty heap.
            return top;
        }
        System.arraycopy(array, 0, top, 0, size);
        buildMinHeap(top);
        for (int i = size; i < array.length; i++) {
            // Strict comparison: an element equal to the current minimum does not replace it.
            if (top[0] < array[i]) {
                top[0] = array[i];
                minHeapify(top, 0, size);
            }
        }
        return top;
    }

    /**
     * Returns the indices (positions in {@code array}) of the k largest values.
     *
     * @param array the input values; it is not modified
     * @param k     how many indices to select; if larger than {@code array.length},
     *              every index is returned
     * @return a new array of {@code min(k, array.length)} indices, ordered by the heap
     *         layout of the values they refer to
     * @throws NullPointerException     if {@code array} is null
     * @throws IllegalArgumentException if {@code k} is negative
     */
    public int[] topKIdx(int[] array, int k) {
        int size = heapSizeFor(array, k);
        int[] top = new int[size];
        int[] topIdx = new int[size];
        if (size == 0) {
            return topIdx;
        }
        System.arraycopy(array, 0, top, 0, size);
        for (int i = 0; i < size; i++) {
            topIdx[i] = i;
        }
        buildMinHeapIdx(topIdx, size, top);
        for (int i = size; i < array.length; i++) {
            if (top[0] < array[i]) {
                // Replace the root value and, in lock-step, the index it came from.
                top[0] = array[i];
                topIdx[0] = i;
                minHeapifyIdx(topIdx, top, 0, size);
            }
        }
        return topIdx;
    }

    /**
     * Sifts the element at {@code position} down until the min-heap property holds for
     * the subtree rooted there.
     *
     * @param array    the heap storage
     * @param position the index of the element to sift down
     * @param heapSize the number of leading elements of {@code array} that belong to the heap
     */
    public void minHeapify(int[] array, int position, int heapSize) {
        int current = position;
        while (true) {
            int smallest = smallestOfFamily(array, current, heapSize);
            if (smallest == current) {
                return;
            }
            swap(array, current, smallest);
            current = smallest;
        }
    }

    /**
     * Same as {@link #minHeapify(int[], int, int)}, but every swap of values is mirrored
     * in {@code topIdxs} so that each value stays paired with its original index.
     *
     * @param topIdxs  the indices paired position-by-position with {@code array}
     * @param array    the heap storage (values)
     * @param position the index of the element to sift down
     * @param heapSize the number of leading elements that belong to the heap
     */
    public void minHeapifyIdx(int[] topIdxs, int[] array, int position, int heapSize) {
        int current = position;
        while (true) {
            int smallest = smallestOfFamily(array, current, heapSize);
            if (smallest == current) {
                return;
            }
            swap(array, current, smallest);
            swap(topIdxs, current, smallest);
            current = smallest;
        }
    }

    /**
     * Rearranges the whole of {@code array} into a min-heap in place.
     *
     * @param array the values to heapify
     */
    public void buildMinHeap(int[] array) {
        int heapSize = array.length;
        // Start at the last node that has a child and work back to the root.
        for (int i = heapSize / 2 - 1; i >= 0; i--) {
            minHeapify(array, i, heapSize);
        }
    }

    /**
     * Rearranges the whole of {@code array} into a min-heap in place, applying the same
     * swaps to {@code topIdxs}.
     *
     * @param topIdxs the indices paired position-by-position with {@code array}
     * @param k       not used: the heap size is always {@code array.length}; the
     *                parameter is kept so existing callers keep compiling
     * @param array   the values to heapify
     */
    public void buildMinHeapIdx(int[] topIdxs, int k, int[] array) {
        int heapSize = array.length;
        for (int i = heapSize / 2 - 1; i >= 0; i--) {
            minHeapifyIdx(topIdxs, array, i, heapSize);
        }
    }

    /**
     * Exchanges the elements at positions {@code i} and {@code j} of {@code array}.
     */
    public void swap(int[] array, int i, int j) {
        int temp = array[i];
        array[i] = array[j];
        array[j] = temp;
    }

    /** Returns the index of the left child of node {@code i}. */
    public int left(int i) {
        return 2 * i + 1;
    }

    /** Returns the index of the right child of node {@code i}. */
    public int right(int i) {
        return 2 * i + 2;
    }

    /** Validates the arguments and returns the heap size actually used. */
    private static int heapSizeFor(int[] array, int k) {
        if (array == null) {
            throw new NullPointerException("array must not be null");
        }
        if (k < 0) {
            throw new IllegalArgumentException("k must not be negative: " + k);
        }
        return Math.min(k, array.length);
    }

    /** Returns whichever of {@code node} and its in-heap children holds the smallest value. */
    private int smallestOfFamily(int[] array, int node, int heapSize) {
        int smallest = node;
        int left = left(node);
        int right = right(node);
        if (left < heapSize && array[left] < array[smallest]) {
            smallest = left;
        }
        if (right < heapSize && array[right] < array[smallest]) {
            smallest = right;
        }
        return smallest;
    }
}
/**
 * Demo driver: fills an array with random single-digit numbers, selects the 9 largest
 * values and their indices with {@link TopK}, and prints the input, both results and the
 * elapsed time to standard error.
 */
public class test {
    public static void main(String[] args) {
        Random random = new Random();
        int[] arr = new int[20];
        System.err.print("初始生成数据: ");
        for (int i = 0; i < arr.length; i++) {
            arr[i] = random.nextInt(10);
            System.err.print(arr[i] + ", ");
        }
        System.err.print("\n赋值完毕...\n前k个最大元素: ");
        TopK topK = new TopK();
        // Time both selections together, in nanoseconds.
        long start = System.nanoTime();
        int[] top = topK.topK(arr, 9);
        int[] topIdx = topK.topKIdx(arr, 9);
        long end = System.nanoTime();
        for (int i = 0; i < top.length; i++) {
            System.err.print(format(top[i]) + ", ");
        }
        System.err.print("\n对应的k个索引: ");
        for (int i = 0; i < topIdx.length; i++) {
            System.err.print(format(topIdx[i]) + ", ");
        }
        // Integer division converts nanoseconds to whole microseconds.
        System.err.println("\ntime: " + (end - start) / 1000 + "us");
    }

    /**
     * Left-pads {@code num} with spaces so that numbers with fewer than two digits line
     * up with two-digit numbers in the printed output.
     *
     * <p>Digits are counted by repeatedly dividing by 10; a minus sign is not counted
     * towards the width.
     *
     * @param num the number to render
     * @return the decimal text of {@code num}, preceded by padding spaces if it is short
     */
    private static String format(int num) {
        // Widest number (in digits) the output columns are aligned to.
        final int maxDigits = 2;
        int digits = 1;
        for (int rest = num / 10; rest != 0; rest /= 10) {
            digits++;
        }
        StringBuilder padded = new StringBuilder();
        for (int j = digits; j < maxDigits; j++) {
            padded.append(' ');
        }
        return padded.append(num).toString();
    }
}
(之所以选这篇博客进行加强是因为这个是我测试过里面最快的,同为堆排序,一些博客跑1亿随机数据找1000个数要90ms左右,但是这篇博客我测了很多遍,均保持在50ms左右,而且代码结构都很清晰,没有堆在一起)
代码测试如下:
另外,基于堆的选取还有一个特点:它返回的最符合的 k 个数据是按堆的内部顺序排列的,并不是严格从大到小排好序的,带有一定的"随机性"。对于推荐算法而言,每次推荐不一定非要按从大到小的固定顺序展示,从某种意义上来讲,这反而增加了推荐的"灵活性"。