LintCode-500: Inverted Index (System Design题)

最新推荐文章于 2023-11-28 06:42:06 发布

纸上得来终觉浅绝知此事要躬行

最新推荐文章于 2023-11-28 06:42:06 发布

阅读量427

点赞数

分类专栏： System Design 文章标签： LintCode

本文链接：https://blog.csdn.net/roufoo/article/details/81603194

版权

System Design 专栏收录该内容

30 篇文章 3 订阅

订阅专栏

这题其实考的是字符串的分词（tokenize）。我自己的方法比较繁琐，九章的解法更简洁。

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index: every word maps to the ids of the documents
     * whose content contains it.
     *
     * Words are the non-empty runs of characters between single spaces (' ');
     * empty tokens produced by leading, trailing or repeated spaces are skipped.
     * An id is stored at most once per word, in the order documents are visited.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;

        // Documents are only read here, so bind by const reference instead of
        // copying each one.
        for (const auto& doc : docs) {
            const string& content = doc.content;
            size_t start = 0;

            // Scan the content in place: only the extracted word is copied,
            // never the remaining tail of the string.
            while (start < content.size()) {
                size_t stop = content.find(' ', start);
                if (stop == string::npos) stop = content.size(); // last word

                if (stop > start) { // skip empty tokens between spaces
                    // One lookup; operator[] creates the entry on first use.
                    vector<int>& ids = result[content.substr(start, stop - start)];
                    if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                        ids.push_back(doc.id);
                    }
                }
                start = stop + 1;
            }
        }

        return result;
    }
};

下面这个split()比较好用，链接在
https://stackoverflow.com/questions/236129/the-most-elegant-way-to-iterate-the-words-of-a-string

/**
 * Splits text into the non-empty pieces found between delimiter characters.
 *
 * @param text   the string to split
 * @param delims the set of characters treated as separators
 * @return the pieces in order of appearance; consecutive, leading and
 *         trailing delimiters never produce empty pieces
 */
std::vector<std::string> split(const std::string& text, const std::string& delims)
{
    std::vector<std::string> pieces;

    // Position of the first character of the piece currently being read.
    std::size_t begin = text.find_first_not_of(delims);

    while (begin != std::string::npos)
    {
        const std::size_t stop = text.find_first_of(delims, begin);
        if (stop == std::string::npos)
        {
            // No further delimiter: the rest of the text is the last piece.
            pieces.push_back(text.substr(begin));
            break;
        }

        pieces.push_back(text.substr(begin, stop - begin));
        // Jump over the whole run of delimiters to the next piece, if any.
        begin = text.find_first_not_of(delims, stop);
    }

    return pieces;
}

九章的解法：

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index by reading each document's content one
     * character at a time and cutting a word at every space.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> index;
        for (const Document& document : docs) {
            string word;
            for (char ch : document.content) {
                if (ch != ' ') {
                    word.push_back(ch);
                    continue;
                }
                // A space ends the current word (which may be empty).
                insert(index, word, document.id);
                word.clear();
            }
            // Flush the word that runs up to the end of the content.
            insert(index, word, document.id);
        }
        return index;
    }

    /**
     * Records id under str unless str is empty or id is already the most
     * recently recorded id for that word.
     *
     * @param results the index being built
     * @param str     the word to record
     * @param id      the id of the document containing the word
     */
    void insert(map<string, vector<int>> &results, string str, int id) {
        if (str.empty())
            return;
        // operator[] creates an empty id list the first time a word is seen.
        vector<int>& ids = results[str];
        if (ids.empty() || ids.back() != id)
            ids.push_back(id);
    }
};

另外，这题也可以用 stringstream 直接对 string 做 tokenize，不过感觉比较慢。注意 stringstream 会按所有空白字符（空格、制表符、换行）分词，而前面的解法只按空格分词。
代码如下：

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index, using stream extraction to tokenize each
     * document's content.
     *
     * Words are whatever operator>> extracts from a stringstream, i.e. runs
     * of non-whitespace characters. An id is stored at most once per word,
     * in the order documents are visited.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;

        // Documents are only read here, so iterate by const reference.
        for (const auto& doc : docs) {
            stringstream ss(doc.content);
            string token;

            // operator>> skips whitespace and never yields an empty token, so
            // each word is indexed as it is read: no empty-string check and
            // no intermediate token list are needed.
            while (ss >> token) {
                // One lookup; operator[] creates the entry on first use.
                vector<int>& ids = result[token];
                if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                    ids.push_back(doc.id);
                }
            }
        }

        return result;
    }
};