LintCode-500: Inverted Index (System Design题)

这题其实考的是字符串的分词(tokenizer)。我自己的方法比较繁琐,后来看到九章的解法更简洁。

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index: for every word, the ids of the documents
     * whose content contains that word.
     *
     * A word is a maximal run of characters that are not the space
     * character ' '. Only ' ' is treated as a separator; consecutive,
     * leading and trailing spaces produce no empty words. Each id is listed
     * at most once per word, in the order in which documents are visited.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;

        // Documents are only read here, so bind by const reference instead
        // of copying every document (and its content string).
        for (const auto& doc : docs) {
            const string& content = doc.content;

            // Scan the content in place: `pos` is the start of the next
            // candidate word. This avoids re-copying the remaining text for
            // every token, which made the scan quadratic in content length.
            size_t pos = 0;
            while (pos < content.size()) {
                size_t nextPos = content.find(' ', pos);
                if (nextPos == string::npos) {
                    nextPos = content.size(); // last word runs to the end
                }

                // nextPos == pos means we are sitting on a space: no word.
                if (nextPos > pos) {
                    // Single lookup; operator[] creates the entry on first
                    // sight of the word, and we always push right after, so
                    // no empty id list is ever left in the map.
                    vector<int>& ids = result[content.substr(pos, nextPos - pos)];
                    if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                        ids.push_back(doc.id);
                    }
                }

                pos = nextPos + 1; // step past the separator
            }
        }

        return result;
    }
};

下面这个split()比较好用,链接在
https://stackoverflow.com/questions/236129/the-most-elegant-way-to-iterate-the-words-of-a-string

/**
 * Splits `text` into the maximal runs of characters that are not in `delims`.
 *
 * Every character of `delims` acts as a separator. Runs of separators, as
 * well as leading and trailing separators, never yield empty tokens.
 *
 * @param text   the string to tokenize
 * @param delims the set of separator characters
 * @return the tokens, in their order of appearance in `text`
 */
std::vector<std::string> split(const std::string& text, const std::string& delims)
{
    std::vector<std::string> pieces;

    // `cursor` always points at the first character of the next token,
    // or is npos once only separators (or nothing) remain.
    std::size_t cursor = text.find_first_not_of(delims);
    while (cursor != std::string::npos)
    {
        const std::size_t stop = text.find_first_of(delims, cursor);
        if (stop == std::string::npos)
        {
            // No separator after this token: it extends to the end of text.
            pieces.push_back(text.substr(cursor));
            break;
        }

        pieces.push_back(text.substr(cursor, stop - cursor));
        cursor = text.find_first_not_of(delims, stop);
    }

    return pieces;
}

九章的解法:

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index by walking each document's content one
     * character at a time and flushing the accumulated word whenever a
     * space character is met (and once more at the end of the content).
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> index;
        for (const Document& document : docs) {
            string word;
            for (char ch : document.content) {
                if (ch != ' ') {
                    word.push_back(ch);
                    continue;
                }
                // Separator reached: record whatever has been collected.
                insert(index, word, document.id);
                word.clear();
            }
            // The content may end without a trailing space.
            insert(index, word, document.id);
        }
        return index;
    }

    /**
     * Records that document `id` contains the word `str`.
     *
     * Empty words are ignored. The id is appended only when it differs from
     * the last id already stored for the word, so repeated occurrences
     * within the document currently being processed are stored once.
     *
     * @param results the index being built
     * @param str     the word to record
     * @param id      the id of the document containing the word
     */
    void insert(map<string, vector<int>> &results, string str, int id) {
        if (str.empty())
            return;
        // operator[] creates an empty id list the first time a word is seen.
        vector<int>& ids = results[str];
        if (ids.empty() || ids.back() != id)
            ids.push_back(id);
    }
};

另外,这题也可以用stringstream直接将string tokenize,不过感觉比较慢。
代码如下:

/**
 * Definition of Document:
 * class Document {
 * public:
 *     int id;
 *     string content;
 * }
 */
class Solution {
public:
    /**
     * Builds an inverted index using stream extraction to tokenize each
     * document's content.
     *
     * Words are whatever `operator>>` extracts into a string, i.e. runs of
     * non-whitespace characters. Each id is listed at most once per word,
     * in the order in which documents are visited.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;

        // Documents are only read, so a const reference is sufficient.
        for (const auto& doc : docs) {
            stringstream ss(doc.content);
            string token;

            // Consume tokens directly from the stream; a successful
            // extraction never yields an empty string, so no emptiness
            // check or intermediate token list is needed.
            while (ss >> token) {
                // Single lookup; the entry is created on first sight of the
                // word and an id is always pushed right after.
                vector<int>& ids = result[token];
                if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                    ids.push_back(doc.id);
                }
            }
        }

        return result;
    }
};
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值