c++学习之单词查询2



#ifndef TEXTQUERY_H
#define TEXTQUERY_H
#include <string>
#include <vector>
#include <map>
#include <set>
#include <iostream>
#include <fstream>
#include <cctype>
#include <cstring>
// TextQuery keeps the lines of a text file in memory together with an index
// that maps each normalized word (lower-cased, punctuation removed, common
// suffixes stripped) to every (line, character position) where it occurs.
class TextQuery {
public:
    // typedefs to make declarations easier
    typedef std::string::size_type str_size;              // offset inside a line
    typedef std::vector<std::string>::size_type line_no;  // index of a stored line
    // (line number, character position in that line); a value of this type
    // can be produced with std::make_pair(line, pos)
    typedef std::pair<line_no,str_size> location;

    // Store every line of `is`, then build the word index from those lines.
    void read_file(std::ifstream &is) 
               { store_file(is); build_map(); }

    // Look up a word; the definition is not part of this listing.
    // NOTE(review): presumably returns the locations recorded for the word --
    // confirm against the implementation.
    std::vector<location> run_query(const std::string&); 

    // Return a copy of stored line number `line`.
    // Const so it can be used through a const TextQuery, and bounds-checked:
    // throws std::out_of_range when `line` >= size() instead of reading past
    // the end of the vector.
    std::string text_line(line_no line) const { return lines_of_text.at(line); }

    // Number of lines stored by read_file().
    str_size size() const { return lines_of_text.size(); }
    void display_map();                  // debugging aid: print the map


private:
    void store_file(std::ifstream&);  // read and store input file
    void build_map();                  // build map of each word in file


    // used by build_map: returns the first non-whitespace position at or
    // after the given position, or the line length if there is none
    str_size skip_whitespace(const std::string&, str_size);


    // test word and if not an excluded word update map 
    void test_insert(const std::string&, str_size, str_size, line_no);


    bool exclude_word(const std::string&); // test for excluded words


    void strip_caps(std::string&);         // make lower case
    void strip_punct(std::string&);        // remove punctuation
    void strip_suffixes(std::string&);     // remove common suffixes


    void suffix_s(std::string&);           // remove suffixes ending in s
    int chk_ending(const std::string&, const char*); // used by suffix_s


    // populate exclusion_set with words to ignore
    static std::set<std::string> build_exclusion_set(); 


private:
    // remember the whole input file
    std::vector<std::string> lines_of_text; 


    // map word to vector of all the line/char positions on which it occurs
    std::map< std::string, std::vector<location> > word_map;  


    // set of words to ignore
    static std::set<std::string> exclusion_set;


    // characters that constitute whitespace
    static std::string whitespace_chars;     
};
#endif
// Append every line read from `is` to lines_of_text, in input order.
// Reading ends as soon as getline fails (end of file or stream error).
void TextQuery::store_file(ifstream &is)
{
    for (string current_line; getline(is, current_line); )
        lines_of_text.push_back(current_line);
}
// Definition of the static set of words that are left out of the word map
// (these are the ignored words, not the words being queried); it is
// initialized from the result of build_exclusion_set().
set<string> TextQuery::exclusion_set = build_exclusion_set();
set<string> TextQuery::build_exclusion_set()
{
    set<string> ret;
    ifstream infile("exclusion_set");//打开一个文件
    if (!infile)
    {
        static string default_excluded_words[] = {
          "the","and","but","that","then","are","been",
          "can","can't","cannot","could","did","for",
          "had","have","him","his","her","its","into",
          "were","which","when","with","would"
         };


        cerr << "warning! unable to open word exclusion file! -- "
            << "using default set" << endl;


        ret = set<string>(default_excluded_words, //set是单纯的键的集合,可以将一段数组付给他
                          default_excluded_words + 
                          sizeof(default_excluded_words)/sizeof(string));
    } else {
        string word;
        while (infile >> word) ret.insert(word);
    }
    return ret;
}
bool TextQuery::exclude_word(const string &word)
{
    return (exclusion_set.find(word) != exclusion_set.end());//find依然返回的是迭代器
}
// Replace every punctuation character in `line` by a space, except the
// apostrophe.  The apostrophe is kept because it may introduce an 's
// suffix, which suffix_s() removes later.  The length of `line` is unchanged.
void TextQuery::strip_punct(string &line)
{
    for (str_size pos = 0; pos != line.size(); ++pos) {
        // The <cctype> classifiers require a value representable as
        // unsigned char (or EOF); passing a negative plain char, e.g. a
        // byte of a non-ASCII character, is undefined behavior.
        const unsigned char ch = static_cast<unsigned char>(line[pos]);
        if (ch != '\'' && ispunct(ch))
            line[pos] = ' ';
    }
}
// Convert every character of `line` to lower case in place.
void TextQuery::strip_caps(string &line)
{
    // the size of line never changes here, so it is safe to cache it
    const str_size sz = line.size();
    for (str_size pos = 0; pos != sz; ++pos) {
        // tolower requires a value representable as unsigned char (or EOF);
        // a negative plain char would be undefined behavior, so convert first.
        const unsigned char ch = static_cast<unsigned char>(line[pos]);
        line[pos] = static_cast<char>(tolower(ch));
    }
}
// Definition of the static separator list used when splitting a line into
// words: space, tab, newline, vertical tab, carriage return and form feed.
string TextQuery::whitespace_chars(" \t\n\v\r\f");
// Return the index of the first character at or after `pos` that is not in
// whitespace_chars; when no such character exists, return line.size().
TextQuery::str_size TextQuery::skip_whitespace(const string &line, str_size pos)
{
    const str_size first_text = line.find_first_not_of(whitespace_chars, pos);
    // npos means nothing but whitespace remains from pos onwards
    return first_text == string::npos ? line.size() : first_text;
}
void TextQuery::build_map()
{
    // process each line from the input vector
    for (line_no line_num = 0; 
                 line_num != lines_of_text.size();
                 ++line_num)
    {
        string textline = lines_of_text[line_num];
        if (textline.empty()) continue; //ignore blank lines


        // make line lower case and remove extraneous punctuation
        strip_caps(textline);
        strip_punct(textline);
   // 这个之后的字符串是这样的:hello world hahah  ahhh ,其中间夹的有空格,换行等符号
        /* 
         * Because we want to keep track of position as well as
         * line number, we have to process the line a character at a time.
         * We can't use an istringstream to read the words.
         * prev_pos will denote first character in each word,
         * pos will denote whitespace that separates the word from the next
         * initially pos denotes first non-whitespace at beginning of line
        */
        str_size pos = skip_whitespace(textline, 0), prev_pos = pos;


        // find each whitespace separated word ,找到下一个是 whitespace_chars的位置
        while ((pos = textline.find_first_of(whitespace_chars, pos)) 
                   != string::npos)
        {
            // remove suffixes and put the word into the map if apporpriate
            test_insert(textline, prev_pos, pos, line_num);//插入这个字符串。


            // if there's more text to process, increment pos to get next char
            if (pos != textline.size())
                ++pos;   //pos 位置增加
            // read and discard adjacent spaces, if any, updating prev_pos too
            pos = prev_pos = skip_whitespace(textline, pos);
        }


        // don't forget last word in the line
        if (pos != prev_pos)  // false if line ends in whitespace
            test_insert(textline, prev_pos, pos, line_num);
    }
}
void TextQuery::test_insert(const string &line, str_size prev_pos, 
                  str_size pos, line_no line_num)
{
    // make copy of the whitespace delimited word
    string word(line.substr(prev_pos, pos - prev_pos));


    strip_suffixes(word); // last of the cleanup operations


    // if there's anything left after stripping punctuation 
    // and it's not an excluded word, add to the map
    // appending line num & char pos to vector for this word
    if (!word.empty() && !exclude_word(word))
         word_map[word].push_back(make_pair(line_num,prev_pos));
    return;//word_map[word]的意思是如果没有word,则会在word_map中插入关联的值
}
void TextQuery::suffix_s(string &word)
{
    // some words ending in s aren't suffixes, they're part of the word
    static char* ok_endings[] = {"ous", "ius", "ss", "is"};
    size_t sz = sizeof(ok_endings)/sizeof(char*);  // how many elements?
    for (size_t i = 0; i != sz; ++i)
        if (chk_ending(word, ok_endings[i]) == 0)
              return;


    // replace common suffixes by their base word ending
    // repl_endings first dimension is the ending we'll remove
    //              second dimension is the new ending we'll insert
    static char* repl_endings[][2] = 
          { {"ies", "y"}, {"ses", "s"}, {"\'s", ""}, {"s", ""} };


    sz = sizeof(repl_endings)/(sizeof(char*) * 2);  // two-dimensions
    for (size_t i = 0; i != sz; ++i) 
        if (chk_ending(word, repl_endings[i][0]) == 0) {
           size_t sz = strlen(repl_endings[i][0]);
           word.replace(word.size() - sz, sz, repl_endings[i][1]);
           return;
        }
}


// compare end of the word with the ending we're given
int TextQuery::chk_ending(const string &word, const char *ending)
{
    size_t sz = strlen(ending);
    return word.compare(word.size() - sz, sz, ending);
}


// TextQuery::strip_caps is defined once, earlier in this file.  The second
// copy that stood here was dropped: defining the same member function twice
// in one translation unit is a redefinition error.


// TextQuery::strip_punct is defined once, earlier in this file: except for
// the apostrophe (which may precede an 's suffix) it replaces punctuation by
// a space.  The second copy that stood here was dropped: defining the same
// member function twice in one translation unit is a redefinition error.
// Strip common suffixes from `word` in place.  Words of three characters or
// fewer are considered too short to carry a suffix and are left unchanged.
void TextQuery::strip_suffixes(string &word)
{
     const bool long_enough = word.size() > 3;

     // only plurals (words ending in 's') are handled so far
     if (long_enough && word[word.size() - 1] == 's')
             suffix_s(word);
     // additional suffix handling goes here
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值