海量数据编程常见问题及编程题（附解法）

最新推荐文章于 2021-02-26 23:56:45 发布

xiayto

最新推荐文章于 2021-02-26 23:56:45 发布

阅读量479

点赞数

分类专栏：算法面试笔记

本文链接：https://blog.csdn.net/xiayto/article/details/83960774

版权

算法同时被 2 个专栏收录

10 篇文章 0 订阅

订阅专栏

面试笔记

1 篇文章 0 订阅

订阅专栏

常见问题及解法：

1）求出现次数最多的K个id
利用hash映射和堆进行排序。先遍历所有的id，映射为一个hash表，key是id，value是出现次数。然后遍历hash表，维护一个有K个节点的小顶堆，得到前K个出现最多的id。
2）做数据判重，求A出现过的id在B中是否有出现，不要求100%准确率
用Bloom filter。通过多个hash实现数据判重，以空间交换时间，但是不能保证100%的准确率。
3）找海量整数的中位数，找重复出现的整数，海量整数排序
用Bitmap。如果要对大量整数进行排序，找中位数，可以用bitmap。存储int需要4个字节，4字节等于32位，用一个位表示一个整数，就可以把所有的整数以位图的方式读入内存，然后找中位数，以空间交换时间，可以实现O(n)时间复杂度的排序。
4）求出现次数最多的前k个id，且大部分id的前缀有重复；或者要查询是否存在带有某个前缀的id
用Trie树。前缀重复的字符只需要占用一次空间，降低存储所需要的空间。
5）关键词检索
倒排索引。它的key是单词，values是出现过这个单词的文章。

编程例题

1、leetcode 692. 前K个高频单词

1）用hash+堆解：

解题思路：
先遍历所有的单词，将单词映射到hashmap中，key是单词，value是单词出现的次数，得到hashmap后，遍历hashmap得到出现次数前K大的词。如果维护size为K的有序数组，那么每次插入要对比K次，得到结果的时间复杂度是O(nk)；可以用一个最小堆替换有序数组进行优化，维护一个size为K的最小堆，如果新元素比堆顶元素大，就弹出堆顶，并把新元素压入堆内。时间复杂度可以降为O(nlog(k))
解题代码：

/**
 * LeetCode 692 - Top K Frequent Words, solved with a hash map plus a bounded
 * heap.
 *
 * Every word is counted in a hash map, then the (count, word) entries are
 * streamed through a heap that never holds more than k entries and whose top
 * is always the weakest candidate kept so far.
 */
class Solution {
public:
    /**
     * Returns the k most frequent words, most frequent first. Words with the
     * same count are ordered lexicographically ascending.
     *
     * @param words input words; not modified.
     * @param k     number of words requested. For k <= 0 an empty vector is
     *              returned; when k exceeds the number of distinct words, all
     *              distinct words are returned.
     * @return the selected words in output order.
     */
    std::vector<std::string> topKFrequent(std::vector<std::string>& words, int k) {
        std::vector<std::string> res;
        // Guard first: with k == 0 the heap stays empty and top() would be
        // undefined behavior; a negative k would wrap to a huge unsigned bound.
        if (k <= 0)
            return res;
        const std::size_t limit = static_cast<std::size_t>(k);

        // Map each word to its number of occurrences.
        std::unordered_map<std::string, int> hashMap;
        for (const std::string& w : words)
            ++hashMap[w];

        // Bounded heap: the top is the entry that sorts last in the output.
        std::priority_queue<Entry, std::vector<Entry>, cmp> smallHeap;
        for (const auto& kv : hashMap) {
            const Entry candidate(kv.second, kv.first);
            if (smallHeap.size() < limit) {
                smallHeap.push(candidate);
            } else if (myCompare(candidate, smallHeap.top())) {
                // The candidate outranks the weakest kept entry: swap them.
                smallHeap.pop();
                smallHeap.push(candidate);
            }
        }

        // The heap pops entries weakest-first, i.e. in exact reverse output
        // order, so filling the result back to front needs no extra sort.
        res.resize(smallHeap.size());
        for (std::size_t i = res.size(); i > 0; --i) {
            res[i - 1] = smallHeap.top().second;
            smallHeap.pop();
        }
        return res;
    }

private:
    // (occurrence count, word)
    typedef std::pair<int, std::string> Entry;

    // Heap ordering: "p1 before p2" means p1 ranks higher in the output
    // (larger count, or equal count and lexicographically smaller word).
    // std::priority_queue keeps the element that sorts last on top.
    struct cmp{
        bool operator() (const Entry& p1, const Entry& p2) const {
            if(p1.first == p2.first)
                return p1.second < p2.second;
            return p1.first > p2.first;
        }
    };

    // True when p1 must appear before p2 in the output; same ordering as cmp.
    static bool myCompare(const Entry& p1, const Entry& p2){
        return cmp()(p1, p2);
    }
};

2）用Trie树解：

解题思路：
如果要查询的前K个最多使用的id，是前缀大量相同的，可以用Trie树进行存储。Trie树的节点记录出现的次数。然后遍历Trie树，维护一个K大小的小顶堆。
解题代码：

// Data structure definition: a trie over the lowercase letters 'a'..'z'.
#define TRIE_MAX_CHAR_NUM 26
struct TrieNode{
    TrieNode *child[TRIE_MAX_CHAR_NUM];  // child[c - 'a'], or nullptr if absent
    int wordNum;                         // number of inserted words ending here

    TrieNode():wordNum(0){
        for(int i = 0; i < TRIE_MAX_CHAR_NUM; i++){
            child[i] = nullptr;
        }
    }

    // A node owns its children: destroying a node releases its whole subtree.
    ~TrieNode(){
        for(int i = 0; i < TRIE_MAX_CHAR_NUM; i++){
            delete child[i];
        }
    }

    // Copying would make two nodes own the same children (double delete).
    TrieNode(const TrieNode&) = delete;
    TrieNode& operator=(const TrieNode&) = delete;
};

/**
 * LeetCode 692 - Top K Frequent Words, solved with a trie plus a bounded heap.
 *
 * Words are stored in a trie whose nodes record how many words end there; the
 * trie is then walked depth-first while a heap of at most k entries keeps the
 * best candidates seen so far.
 */
class Solution {
public:
    // (occurrence count, word)
    typedef std::pair<int, std::string> Entry;

    // Heap ordering: "p1 before p2" means p1 ranks higher in the output
    // (larger count, or equal count and lexicographically smaller word).
    // std::priority_queue keeps the element that sorts last on top.
    struct cmp{
        bool operator() (const Entry& p1, const Entry& p2) const {
            if(p1.first == p2.first)
                return p1.second < p2.second;
            return p1.first > p2.first;
        }
    };

    typedef std::priority_queue<Entry, std::vector<Entry>, cmp> Heap;

    /**
     * Inserts one occurrence of `word` below `root`, creating nodes on demand.
     *
     * Precondition: every character of `word` is in 'a'..'z'; any other
     * character would index outside TrieNode::child. A null root is ignored.
     */
    void insert(const std::string& word, TrieNode* root){
        if(!root)
            return;
        TrieNode *ptr = root;
        for(const char c : word){
            const int pos = c - 'a';
            if(!ptr->child[pos]){
                ptr->child[pos] = new TrieNode();
            }
            ptr = ptr->child[pos];
        }
        ptr->wordNum++;
    }

    /**
     * Walks the subtree below `node` depth-first and offers every word that
     * ends at a descendant to `smallHeap`, keeping at most k entries.
     *
     * @param node      subtree root; its own wordNum is NOT considered.
     * @param word      prefix spelled by the path to `node`; restored to its
     *                  incoming value before returning.
     * @param smallHeap bounded heap receiving the candidates.
     * @param k         maximum heap size; nothing is collected for k <= 0.
     */
    void getSmallHeap(TrieNode* node, std::string &word, Heap &smallHeap, int k){
        if(!node)
            return;
        for (int i = 0; i < TRIE_MAX_CHAR_NUM; i++){
            TrieNode* next = node->child[i];
            if(!next)
                continue;
            word.push_back(static_cast<char>('a' + i));
            offer(next->wordNum, word, smallHeap, k);
            getSmallHeap(next, word, smallHeap, k);
            word.pop_back();
        }
    }

    /**
     * Returns the k most frequent words, most frequent first. Words with the
     * same count are ordered lexicographically ascending.
     *
     * @param words input words made of 'a'..'z' (see insert); not modified.
     * @param k     number of words requested. For k <= 0 an empty vector is
     *              returned; when k exceeds the number of distinct words, all
     *              distinct words are returned.
     */
    std::vector<std::string> topKFrequent(std::vector<std::string>& words, int k) {
        std::vector<std::string> res;
        if (k <= 0)
            return res;

        // Store the words in a trie. The root lives on the stack so the whole
        // trie is released by ~TrieNode when this function returns.
        TrieNode root;
        for (const std::string& w : words)
            insert(w, &root);

        Heap smallHeap;
        std::string word;
        // An empty-string word ends at the root itself, which getSmallHeap
        // does not look at.
        offer(root.wordNum, word, smallHeap, k);
        getSmallHeap(&root, word, smallHeap, k);

        // The heap pops entries weakest-first, i.e. in exact reverse output
        // order, so filling the result back to front needs no extra sort.
        res.resize(smallHeap.size());
        for (std::size_t i = res.size(); i > 0; --i) {
            res[i - 1] = smallHeap.top().second;
            smallHeap.pop();
        }
        return res;
    }

private:
    // True when p1 must appear before p2 in the output; same ordering as cmp.
    static bool myCompare(const Entry& p1, const Entry& p2){
        return cmp()(p1, p2);
    }

    // Offers (count, word) to the bounded heap: pushed while there is room,
    // otherwise it replaces the weakest kept entry if it outranks it.
    // Entries with count <= 0 (pure prefixes) are never stored.
    static void offer(int count, const std::string& word, Heap& heap, int k){
        if (count <= 0 || k <= 0)
            return;
        const Entry candidate(count, word);
        if (heap.size() < static_cast<std::size_t>(k)) {
            heap.push(candidate);
        } else if (myCompare(candidate, heap.top())) {
            heap.pop();
            heap.push(candidate);
        }
    }
};

2、剑指offer. 数组中重复的数字（Bitmap）

题目地址：https://www.nowcoder.com/practice/623a5ac0ea5b4e5f95552655361ae0a8?tpId=13&tqId=11203&tPage=3&rp=3&ru=/ta/coding-interviews&qru=/ta/coding-interviews/question-ranking
题意解释：
输入一个数组，按顺序找出已出现过的数字，如果数字出现过存到一个数组中。这题最常规的解法可以用hashmap解，key是数字，如果出现过就放入数组，但是需要为hashmap开辟额外的空间；因为题目有提到，数组的长度为n，数组内的数字大小小于n，其实可以用输入的数组为辅助数组，实现时间复杂度是O(n)，空间复杂度是O(1)的解法：遍历数组，把数组中的数字（先对n取余还原出原值）对应位置上的数+n，如果该位置的数字已经大于等于n，证明这个数字出现过。
海量数据解题思路：
这里假设遇到的是海量数据的问题，用bitmap解：如果不能将所有的int放入数组，就用位存储数据，一个int是32位，能使存储大小压缩为1/32，一般可以存进内存中。bitmap的数据结构一般用数组实现，得到一个int之后，先除以32(>>5)求出它落在数组中的哪个位置上，再把这个int对32取余(&31)，把对应的bit位标记为1。遍历输入时，如果某个数字对应的bit位在标记之前已经是1，说明这个数字重复出现过。
解题代码：

#include<vector>
/**
 * Bitmap over the non-negative integers 0..num, packed 32 bits per
 * unsigned int word. Inside a word, index bits 0..31 run from the most
 * significant bit down to the least significant one.
 *
 * Indices are bounds-checked against the allocated words: a negative index,
 * or one beyond the last allocated word, makes set/get/clr throw
 * std::out_of_range (raised by std::vector::at) instead of touching memory
 * outside the array.
 */
class BitMap {
public:
    /**
     * @param num largest index the bitmap must be able to hold; a negative
     *            value is treated as 0 so the word count can never wrap into
     *            a huge allocation.
     */
    BitMap(int num)
        : n(num < 0 ? 0 : num),
          mask(31),
          shift(5),
          pos(1u << mask),  // unsigned literal: 1 << 31 on a signed int overflows
          a(static_cast<std::size_t>(1 + n / 32), 0u) {}

    // Marks index i as present.
    void set(int i) {
        a.at(wordIndex(i)) |= bitFor(i);
    }

    // Returns 1 when index i is marked, 0 otherwise.
    int get(int i) const {
        return (a.at(wordIndex(i)) & bitFor(i)) != 0u ? 1 : 0;
    }

    // Clears the mark for index i.
    void clr(int i) {
        a.at(wordIndex(i)) &= ~bitFor(i);
    }

private:
    // Word holding index i (i / 32). A negative i maps to a.size(), which
    // at() always rejects.
    std::size_t wordIndex(int i) const {
        return i < 0 ? a.size() : static_cast<std::size_t>(i >> shift);
    }

    // Single-bit mask for index i inside its word (i % 32, counted from MSB).
    unsigned int bitFor(int i) const {
        return pos >> (i & mask);
    }

    int n;                    // capacity requested at construction (clamped to >= 0)
    const int mask;           // 31: extracts the bit offset inside a word
    const int shift;          // 5: log2(32), converts an index to a word index
    const unsigned int pos;   // mask with only the most significant bit set
    std::vector<unsigned int> a;  // packed bits, 1 + n / 32 words
};

class Solution {
public:
    /**
     * Finds the first value of `numbers` that has already appeared earlier in
     * the array, using a BitMap with one bit per possible value.
     *
     * @param numbers     an array of integers, each expected in [0, length)
     * @param length      the length of array numbers
     * @param duplication (Output) receives the duplicated number at index 0
     *                    when one is found; may be nullptr if the caller only
     *                    needs the boolean answer
     * @return true if the input is valid and there is a duplication in the
     *         array; false otherwise (null array, non-positive length, a
     *         value outside [0, length), or no duplicate)
     */
    bool duplicate(int numbers[], int length, int* duplication) {
        if(numbers == nullptr || length <= 0)
            return false;

        // Validate before touching the bitmap: a value outside [0, length)
        // makes the input invalid and must never be used as a bit index.
        for(int i = 0; i < length; i++){
            if(numbers[i] < 0 || numbers[i] >= length)
                return false;
        }

        BitMap bitMap(length);
        for(int i = 0; i < length; i++){
            const int value = numbers[i];
            if(bitMap.get(value)){
                // Bit already set: this value was seen at an earlier position.
                if(duplication != nullptr)
                    duplication[0] = value;
                return true;
            }
            bitMap.set(value);
        }
        return false;
    }
};

3、BloomFilter

用于数据判重，主要思想是设计K个hash函数，插入一个数据（key）时，将K个函数对应的value处的位都设置为1。当要查询一个数据在不在集合内，计算所有hash函数，如果其中有一个对应的位为0，则数据不在里面，如果都为1，则有可能存在，但是也可能不在，存在一定的错误率。
这里只是简单介绍一下思想，具体错误率的计算，最优hash个数，最优位数组大小可以看看其他博客。