TopK问题介绍

最新推荐文章于 2022-12-11 12:08:08 发布

hawonor

最新推荐文章于 2022-12-11 12:08:08 发布

阅读量153

点赞数

分类专栏：算法与数据结构文章标签：算法堆排序快速排序

本文链接：https://blog.csdn.net/weixin_44039270/article/details/106421957

版权

算法与数据结构专栏收录该内容

27 篇文章 0 订阅

订阅专栏

TopK问题介绍

前言
堆排序
堆排序实战
C++ priority_queue
分治算法（快速排序）

前言

TopK问题算是比较基础的算法题了，其实很简单，当然方法有很多，我比较熟悉堆排序，咱们这次借着机会将所有的方法都熟悉一遍。
TopK问题，就是希望你在海量数据中，查找前K个数据，可以是最大数据，或者是前20的数据，诸如此类。如果你想排序再查找，那么当数据量还不大的时候，OK。如果数据量大到上百w，那就真的原地爆炸了。

堆排序

没错，这是一种还不错的方法，核心思想是：维护一个大小为K的堆。这个堆其实还算有序，它的特点是：
该堆对应的完全二叉树满足每个父节点都大于等于（大顶堆）或者小于等于（小顶堆）它的孩子节点，因此根节点大于（或者小于）所有的其他节点（当然允许相等）。其查找Top1是O(1)复杂度，插入和删除是O(logN)复杂度，很棒对吧。
先记住第一点，即内存堆和完全二叉树的对应关系：
1.内存堆中第i个节点的父亲节点是(i-1) / 2；
2.内存堆中第i个节点的左孩子节点是2i + 1；
3.内存堆中第i个节点的右孩子节点是2i + 2；
很简单对吧，前提是要存在，好吧。自己画画也就出来了，没啥好说的。
再记住第二点，即堆排序的插入和删除操作。
1.对于插入操作，先将新来的家伙放在最后，然后一层层冒泡上去。
2.对于删除操作（更确切的说是pop操作），将最后一个元素顶替删除的top位置，然后将新的top冒泡下去。
听起来非常简单对吧，咱们实战一下：

堆排序实战

leetcode：347 topK 堆排序 medium
这道题实在是一点变化都没有，咱们迅速将其ac

// LeetCode 347 (Top K Frequent Elements), solved with a hand-written
// bounded min-heap keyed on occurrence count.
class Solution {
public:
    // Returns up to k distinct values of nums with the highest occurrence
    // counts.
    //
    // nums: input values; not modified.
    // k:    number of values requested. k <= 0 yields an empty result; if k
    //       exceeds the number of distinct values, every distinct value is
    //       returned.
    //
    // The result is in heap-array order, not sorted by frequency.
    std::vector<int> topKFrequent(std::vector<int>& nums, int k) {
        std::vector<int> res;
        // Without this guard an empty heap would be treated as "full" and
        // heap[0] would be accessed out of bounds.
        if (k <= 0)
            return res;

        // value -> number of occurrences; operator[] value-initializes a
        // missing key to 0 before the increment.
        std::map<int, int> counts;
        for (int value : nums)
            ++counts[value];

        const Index capacity = static_cast<Index>(k);
        std::vector<Entry> heap;  // min-heap on Entry::second (the count)
        for (const auto& entry : counts)
        {
            if (heap.size() < capacity)
            {
                // Still filling: append at the end and restore the heap order.
                heap.push_back(entry);
                siftUp(heap, heap.size() - 1);
            }
            else if (entry.second > heap[0].second)
            {
                // Strictly more frequent than the weakest kept entry:
                // replace the root and push it down to its place.
                heap[0] = entry;
                siftDown(heap, 0);
            }
        }

        res.reserve(heap.size());
        for (const Entry& entry : heap)
            res.push_back(entry.first);
        return res;
    }

private:
    typedef std::pair<int, int> Entry;         // (value, occurrence count)
    typedef std::vector<Entry>::size_type Index;

    // Moves heap[index] towards the root while its count is smaller than
    // its parent's count.
    static void siftUp(std::vector<Entry>& heap, Index index) {
        while (index > 0)
        {
            const Index parent = (index - 1) / 2;
            if (heap[index].second >= heap[parent].second)
                break;
            const Entry tempExchange = heap[parent];
            heap[parent] = heap[index];
            heap[index] = tempExchange;
            index = parent;
        }
    }

    // Moves heap[index] towards the leaves while its count is larger than
    // the smaller of its children's counts (the left child wins ties).
    static void siftDown(std::vector<Entry>& heap, Index index) {
        const Index size = heap.size();
        while (true)
        {
            const Index left = 2 * index + 1;
            if (left >= size)
                break;  // no children left
            const Index right = left + 1;
            Index smallest = left;
            if (right < size && heap[right].second < heap[left].second)
                smallest = right;
            if (heap[index].second <= heap[smallest].second)
                break;
            const Entry tempExchange = heap[smallest];
            heap[smallest] = heap[index];
            heap[index] = tempExchange;
            index = smallest;
        }
    }
};

感觉写的很垃圾，暂时不清楚为啥别人能用priority_queue。

C++ priority_queue

源码我看了一下，这东西就是后端插入，前端输出，似乎与上面的题目没法匹配。应该是我自己的问题，我去了解一下。
原来是我的问题。。。。。。
我的做法是：维护一个大小固定的优先队列（我自己用vector模仿出来的）当大小超过K时从前端插入。。。。。。我这写的啥乱七八糟的东西。。。

分治算法（快速排序）

这个算法感觉还是挺厉害的，试试看我能不能讲清楚。平均时间复杂度是O(n)，感觉还不错。其实topK用到的快排（也叫快速选择）和常规快排稍微有点不同：每次划分之后，只继续处理包含第K个位置的那一部分，对于没用的部分就不管了。常规快排平均是O(nlogn)复杂度，最差是O(n方)复杂度；快速选择最差同样是O(n方)，但平均只需要O(n)，是要比O(nlogn)更好的。
对于arr数组，找一个中间值，然后开始将index=i到index=j的数据全部规整好，比中间值小的放在左边，比中间值大的放在右边。
你可能会问了，万一遇到需要调整位置的，我难不成还得把index删除，然后再插入？？？这对于vector的时间复杂度可是很高的，咋办？
很好解决，咱们从两头开始找起。左边攒一个非法分子，右边攒一个非法分子，然后他俩交换，美滋滋？那第一个咋办？只能等左右碰头了，再将第一个元素移动到合适位置，顺便给出下标index，完美。

// Quickselect step: rearranges details[startIndex..endIndex] around the pivot
// details[startIndex] (smaller values to the left, greater-or-equal values to
// the right), then recurses into the side that still contains position k.
//
// details:    values to rearrange in place.
// startIndex: first index of the range (inclusive).
// endIndex:   last index of the range (inclusive).
// k:          absolute position in details that the selection is aimed at.
//
// Returns the index where the last pivot settled. When the function returns
// through the k / k - 1 check, details[0..k) holds the k smallest values
// (in no particular order). A range with fewer than two elements is returned
// as-is with startIndex as the result.
int partition(std::vector<int>& details, int startIndex, int endIndex, int k)
{
    // Base case: nothing to rearrange. Without it leftIndex starts beyond
    // rightIndex and the scanning loop below can never reach its exit test.
    if (startIndex >= endIndex)
        return startIndex;

    const int pivot = details[startIndex];
    int leftIndex = startIndex + 1;
    int rightIndex = endIndex;
    while (true)
    {
        // Find a misplaced pair: a small value on the right and a
        // greater-or-equal value on the left, then exchange them.
        while (rightIndex > leftIndex && details[rightIndex] >= pivot)
            --rightIndex;
        while (rightIndex > leftIndex && details[leftIndex] < pivot)
            ++leftIndex;
        if (rightIndex > leftIndex)
        {
            const int tempExchange = details[leftIndex];
            details[leftIndex] = details[rightIndex];
            details[rightIndex] = tempExchange;
        }
        // The scanners met (or are adjacent after a swap): range is split.
        if (rightIndex - leftIndex <= 1)
            break;
    }

    // Slide the pivot right past every smaller value so it lands on the
    // boundary between the two halves.
    int targetIndex = startIndex;
    while (targetIndex < endIndex && details[targetIndex + 1] < pivot)
    {
        details[targetIndex] = details[targetIndex + 1];
        ++targetIndex;
    }
    details[targetIndex] = pivot;

    if (targetIndex == k || targetIndex == k - 1)
        return targetIndex;
    else if (targetIndex > k)
        return partition(details, startIndex, targetIndex - 1, k);
    else
        return partition(details, targetIndex + 1, endIndex, k);
}

// Prints the k largest occurrence counts found in nums, one per line, using
// the quickselect-style partition() instead of a full sort.
//
// nums: input values; not modified.
// k:    how many counts to print. k <= 0 prints nothing; a k larger than the
//       number of distinct values is clamped to that number.
//
// Only the counts are printed (not the values they belong to), and the order
// of the printed lines is whatever partition() leaves behind.
void topKFrequent(std::vector<int>& nums, int k) {
    // value -> number of occurrences; operator[] value-initializes a missing
    // key to 0 before the increment.
    std::map<int, int> counts;
    for (int value : nums)
        ++counts[value];

    std::vector<int> details;
    details.reserve(counts.size());
    for (const auto& entry : counts)
        details.push_back(entry.second);

    const int total = static_cast<int>(details.size());
    if (k <= 0 || total == 0)
        return;
    if (k > total)
        k = total;  // never index past the available counts

    //solution
    // partition() gathers the smallest values at the front, so selecting the
    // (total - k) smallest leaves the k largest counts in the tail. When
    // k == total everything is printed and no selection is needed.
    if (k < total)
        partition(details, 0, total - 1, total - k);
    for (int i = total - k; i < total; ++i)
        std::cout << details[i] << '\n';
}