编程之美-3.5-最短摘要的生成

最新推荐文章于 2021-02-27 15:24:43 发布

?Briella

最新推荐文章于 2021-02-27 15:24:43 发布

阅读量77

点赞数

1. 简述

这道题的题干说的不是特别清楚，在网上看了几篇相关的博文才搞清楚，对于算法本身就是编程之美给出的解法，现在我还没有深入的理解。题干大意如下：输入两个字符串，一个表示用户输入的查询，另一个表示一篇文档的内容。对于查询和文档分别进行自动分词后，用户查询和文档内容的两个字符串变为两个词语序列。比如，下面的keyword表示分词后的用户查询，str表示分词后的文档内容。

   string keyword[] = { " 微软 " , " 计算机 " , " 亚洲 " , " 中国 " };
   string str[] = {
     " 微软 " , " 亚洲 " , " 研究院 " , " 成立 " , " 于 " , " 1998 " , " 年 " , " ， " , " 我们 " , " 的 " , " 使命 " ,
     " 是 " , " 使 " , " 未来 " , " 的 " , " 计算机 " , " 能够 " , " 看 " , " 、 " , " 听 " , " 、 " , " 学 " , " ， " ,
     " 能 " , " 用 " , " 自然语言 " , " 与 " , " 人类 " , " 进行 " , " 交流 " , " 。 " , " 在 " , " 此 " , " 基础 " , " 上 " ,
     " ， " , " 微软 " , " 亚洲 " , " 研究院 " , " 还 " , " 将 " , " 促进 " , " 计算机 " , " 在 " , " 亚太 " , " 地区 " ,
     " 的 " , " 普及 " , " ， " , " 改善 " , " 亚太 " , " 用户 " , " 的 " , " 计算 " , " 体验 " , " 。 " , " ” "
  };

所要求的是str中的若干个连续的字符串，假设为str[i]-str[j]，其中包含keyword中的所有词语，且str[i]-str[j]是满足这个条件的最短的一个。

2. 思路

    这个主要注意几点：首先，str[i]-str[j]包含keyword里面的所有词语，但是不要求顺序相同，然后，str[i]-str[j]是所有满足这样要求的最短子串。
    主要方法是：deque<int> store：记录当前摘要中所有关键词在str中的下标；map<string, int> record：记录每个关键词在当前摘要中出现的次数。min_len：当前最短摘要的长度。min_index_first：当前最短摘要的第一个词语在str中的下标，min_index_last：当前最短摘要的最后一个词语在str中的下标。
    第一步，寻找第一个完整摘要，更新store和record。如果没找到，程序结束。如果找到了，计算min_len、min_index_first、min_index_last，然后进行第二步。
    第二步，队列中去掉一个第一个关键词，同时更新record，如果record中该关键词计数还大于0，这说明虽然当前摘要短了，但是还是完整的，因此这必然是一个更短的摘要，min_len--;min_index_first++; 重复第二步。如果record中关键词的计数为0了，这说明当前摘要不完整了，需要向后面找缺少的关键词，进入第三步。
   第三步，在当前摘要后面寻找缺少的关键词。如果下标越界了，说明不能再找到完整的摘要了，停止工作即可。如果找到的不是关键词，index++，即向后移动继续找。如果找到的是关键词，则更新store和record；此时如果该关键词恰好是缺少的关键词，则转向第二步。无论是否转向第二步，都要先执行index++。

3. 代码实现

大体思路不难，但是把当前摘要的关键词下标序列，当前摘要的关键词计数，最短摘要位置，缺少的关键词，这些数据放在一起，还要更新，逻辑就有点复杂了。写了好一会才写出来，不好说对与不对，毕竟没有完整的测试。

#include < iostream >
#include < string >
#include < map >
#include < deque >
using namespace std;

// Prints one candidate abstract: a title line, the window length, the words
// str[first..last] (inclusive) each followed by a space, and a separator line.
static void print_abstract_window(const char* title, const std::string str[], int first, int last) {
  std::cout << title << std::endl;
  std::cout << " min len: " << last - first + 1 << std::endl;
  for (int i = first; i <= last; i++)
    std::cout << str[i] << " ";
  std::cout << std::endl;
  std::cout << " --------------------------------------------- " << std::endl;
}

// Compares the complete window str[first..last] with the best abstract found so
// far. If it is strictly shorter, the best-so-far variables are updated. Either
// way the window is printed, labelled as shorter or not shorter.
static void consider_abstract_window(const std::string str[], int first, int last,
                                     int& min_len, int& min_index_first, int& min_index_last) {
  const int len = last - first + 1;
  if (len < min_len) {
    min_len = len;
    min_index_first = first;
    min_index_last = last;
    print_abstract_window(" 更短的摘要 ", str, first, last);
  }
  else {
    print_abstract_window(" 并非更短的摘要 ", str, first, last);
  }
}

// Searches str[0..len_str) for the shortest run of consecutive words that
// contains every word of keyword[0..len_keyword), in any order, and prints each
// candidate window it examines to std::cout.
//
// Parameters:
//   str         - the segmented document, one word per element
//   keyword     - the segmented query; duplicate entries are counted once
//   len_str     - number of elements in str
//   len_keyword - number of elements in keyword
//
// Output: " not abstract found " when no window contains all keywords (or when
// there are no keywords at all); otherwise the first complete window followed by
// every later complete window, each labelled as shorter or not shorter than the
// best one seen so far.
void find_min_len_abstract(std::string str[], std::string keyword[], int len_str, int len_keyword) {
  // record maps each keyword to its number of occurrences inside the current window.
  std::map<std::string, int> record;
  for (int i = 0; i < len_keyword; i++)
    record[keyword[i]] = 0;
  // Count distinct keywords: a duplicated keyword must not raise the target,
  // otherwise the target could never be reached.
  const int distinct_keywords = static_cast<int>(record.size());

  std::deque<int> store; // indices in str of the keywords inside the current window
  int find_key_num = 0;  // number of distinct keywords present in the current window
  int index = 0;         // next position in str to examine

  // Step 1: extend the window to the right until it contains every keyword.
  while (find_key_num < distinct_keywords && index < len_str) {
    std::map<std::string, int>::iterator hit = record.find(str[index]);
    if (hit != record.end()) {
      if (hit->second == 0) // first occurrence of this keyword
        find_key_num++;
      hit->second++;
      store.push_back(index);
    }
    index++;
  }

  // With no keywords the deque is empty, so there is no window to report.
  if (distinct_keywords == 0 || find_key_num < distinct_keywords) {
    std::cout << " not abstract found " << std::endl;
    return;
  }

  int min_len = store.back() - store.front() + 1;
  int min_index_first = store.front();
  int min_index_last = store.back();
  print_abstract_window(" 第一个摘要 ", str, min_index_first, min_index_last);

  std::string need_key;      // keyword currently missing from the window
  bool already_found = true; // true while the current window contains every keyword
  while (true) {
    if (already_found) {
      // Step 2: drop the leftmost keyword from the window.
      const std::string& first_key = str[store.front()];
      std::map<std::string, int>::iterator front_count = record.find(first_key);
      front_count->second--;
      store.pop_front();
      if (front_count->second == 0) {
        // The window is no longer complete; look for this keyword further right.
        already_found = false;
        need_key = first_key;
      }
      else {
        // Still complete. The window now starts at the next stored keyword, so
        // measure it from the deque rather than assuming it shrank by one word.
        consider_abstract_window(str, store.front(), store.back(),
                                 min_len, min_index_first, min_index_last);
      }
    }
    else {
      // Step 3: extend the window to the right until the missing keyword reappears.
      if (index >= len_str)
        break; // the missing keyword cannot be found any more
      std::map<std::string, int>::iterator hit = record.find(str[index]);
      if (hit != record.end()) {
        hit->second++;
        store.push_back(index);
        if (str[index] == need_key) {
          already_found = true;
          consider_abstract_window(str, store.front(), store.back(),
                                   min_len, min_index_first, min_index_last);
        }
      }
      index++;
    }
  }
}

// Demo driver: runs the shortest-abstract search on the sample query and
// document and lets find_min_len_abstract print its findings to std::cout.
// No console pause is issued, so run it from a terminal to read the output.
int main() {
  // Segmented user query.
  std::string keyword[] = { " 微软 " , " 计算机 " , " 亚洲 " };
  // Segmented document text, one word or punctuation mark per element.
  std::string str[] = {
    " 微软 " , " 亚洲 " , " 研究院 " , " 成立 " , " 于 " , " 1998 " , " 年 " , " ， " , " 我们 " , " 的 " , " 使命 " ,
    " 是 " , " 使 " , " 未来 " , " 的 " , " 计算机 " , " 能够 " , " 看 " , " 、 " , " 听 " , " 、 " , " 学 " , " ， " ,
    " 能 " , " 用 " , " 自然语言 " , " 与 " , " 人类 " , " 进行 " , " 交流 " , " 。 " , " 在 " , " 此 " , " 基础 " , " 上 " ,
    " ， " , " 微软 " , " 亚洲 " , " 研究院 " , " 还 " , " 将 " , " 促进 " , " 计算机 " , " 在 " , " 亚太 " , " 地区 " ,
    " 的 " , " 普及 " , " ， " , " 改善 " , " 亚太 " , " 用户 " , " 的 " , " 计算 " , " 体验 " , " 。 " , " ” "
  };
  // Element counts are derived from the arrays so they follow any edit to the data.
  const int len_keyword = sizeof(keyword) / sizeof(keyword[0]);
  const int len_str = sizeof(str) / sizeof(str[0]);
  find_min_len_abstract(str, keyword, len_str, len_keyword);
  return 0;
}

输出结果如下：

4. 参考

最短摘要的生成 http://www.cnblogs.com/flyinghearts/archive/2011/03/24/1994453.html

?Briella

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
编程之美-3.5-最短摘要的生成

1. 简述这道题的题干说的不是特别清楚，在网上看了几篇相关的博文才搞清楚，对于算法本身就是编程之美给出的解法，现在我还没有深入的理解。题干大意如下：输入两个字符串，一个表示用户输入的查询，另一个表示一篇文档的内容。对于查询和文档分别进行自动分词后，用户查询和文档内容的两个字符串变为两个词语序列。比如，下面的keyword表示分词后的用户查询，str表示分词后的文档内容。 st...
复制链接

扫一扫