这题其实考的是字符串的分词(tokenizer)。我的方法比较繁琐,九章的解法更简洁(见下文)。
/**
* Definition of Document:
* class Document {
* public:
* int id;
* string content;
* }
*/
class Solution {
public:
    /**
     * Builds an inverted index: each distinct word maps to the ids of the
     * documents whose content contains it.
     *
     * Words are the non-empty pieces of `content` separated by the space
     * character ' '; runs of spaces and leading/trailing spaces yield no
     * words. Ids are appended in the order the documents are visited, and an
     * id is never stored twice under the same word.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;
        // Documents are only read, so bind by const reference instead of
        // copying each one.
        for (const auto& doc : docs) {
            const string& content = doc.content;
            size_t start = 0;
            // Walk the string by index; no intermediate copies of the
            // remaining tail are made.
            while (start < content.size()) {
                size_t stop = content.find(' ', start);
                if (stop == string::npos) {
                    stop = content.size();  // last word runs to the end
                }
                if (stop > start) {  // skip empty pieces between spaces
                    // A single lookup: operator[] creates the empty id list
                    // on first sight of the word.
                    vector<int>& ids = result[content.substr(start, stop - start)];
                    if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                        ids.push_back(doc.id);
                    }
                }
                start = stop + 1;
            }
        }
        return result;
    }
};
下面这个 split() 比较好用,出自 Stack Overflow,链接如下:
https://stackoverflow.com/questions/236129/the-most-elegant-way-to-iterate-the-words-of-a-string
// Splits `text` into the maximal runs of characters that are not in `delims`.
// Consecutive, leading and trailing delimiters never produce empty tokens;
// a text with no non-delimiter characters yields an empty vector.
std::vector<std::string> split(const std::string& text, const std::string& delims)
{
    std::vector<std::string> words;
    // `begin` always points at the first character of the next token, or is
    // npos once only delimiters (or nothing) remain.
    std::size_t begin = text.find_first_not_of(delims);
    while (begin != std::string::npos)
    {
        const std::size_t stop = text.find_first_of(delims, begin);
        if (stop == std::string::npos)
        {
            // No delimiter follows: the final token runs to the end of text.
            words.push_back(text.substr(begin));
            break;
        }
        words.push_back(text.substr(begin, stop - begin));
        begin = text.find_first_not_of(delims, stop);
    }
    return words;
}
九章的解法:
/**
* Definition of Document:
* class Document {
* public:
* int id;
* string content;
* }
*/
class Solution {
public:
    /**
     * Builds an inverted index: each distinct word maps to the ids of the
     * documents whose content contains it.
     *
     * The content is scanned character by character; a space ' ' ends the
     * current word, and the word still pending at the end of the content is
     * recorded as well. Empty words (from repeated, leading or trailing
     * spaces) are ignored.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> results;
        string word;  // reused across documents to keep its capacity
        for (const Document& doc : docs) {
            word.clear();
            // Iterate the content in place; no copy of the string is needed.
            for (const char ch : doc.content) {
                if (ch == ' ') {
                    insert(results, word, doc.id);
                    word.clear();
                } else {
                    word += ch;
                }
            }
            // Flush the last word, which has no trailing space.
            insert(results, word, doc.id);
        }
        return results;
    }

    /**
     * Records that document `id` contains the word `str`.
     *
     * Empty words are ignored. The id is appended only when it differs from
     * the last id already stored for the word, so repeated occurrences within
     * the document currently being processed are stored once.
     *
     * @param results the index being built
     * @param str     the word to record
     * @param id      the id of the document containing the word
     */
    void insert(map<string, vector<int>>& results, const string& str, int id) {
        if (str.empty())
            return;
        // operator[] creates an empty id list the first time a word is seen.
        vector<int>& ids = results[str];
        if (ids.empty() || ids.back() != id)
            ids.push_back(id);
    }
};
另外,这题也可以用 stringstream 直接对字符串做分词(tokenize),不过感觉比较慢。
代码如下:
/**
* Definition of Document:
* class Document {
* public:
* int id;
* string content;
* }
*/
class Solution {
public:
    /**
     * Builds an inverted index: each distinct word maps to the ids of the
     * documents whose content contains it.
     *
     * Words are extracted with stream `operator>>`, so they are delimited by
     * any whitespace and are never empty. Ids are appended in the order the
     * documents are visited, and an id is never stored twice under the same
     * word.
     *
     * @param docs a list of documents
     * @return an inverted index
     */
    map<string, vector<int>> invertedIndex(vector<Document>& docs) {
        map<string, vector<int>> result;
        // Documents are only read, so a const reference is sufficient.
        for (const auto& doc : docs) {
            istringstream words(doc.content);
            string token;
            // Index each word as it is read; no intermediate token list is
            // needed.
            while (words >> token) {
                // A single lookup: operator[] creates the empty id list on
                // first sight of the word.
                vector<int>& ids = result[token];
                if (find(ids.begin(), ids.end(), doc.id) == ids.end()) {
                    ids.push_back(doc.id);
                }
            }
        }
        return result;
    }
};