《编程之美》读书笔记24： 3.5 最短摘要的生成

最新推荐文章于 2023-07-31 17:53:22 发布

weixin_34331102

最新推荐文章于 2023-07-31 17:53:22 发布

阅读量84

点赞数

原文链接：http://www.cnblogs.com/flyinghearts/archive/2011/03/24/1994453.html

版权

《编程之美》读书笔记24： 3.5 最短摘要的生成

当初看这道题时，看了好了几遍都没看懂。后来总算弄明白：给出的字符串是用其它程序分好词的，关键字符串也是用其它程序分好词的，而不是按用户直接输入的字符串。比如书上给的例子：“微软亚洲研究院使命”，不是按空格分成两个关键词，“微软亚洲研究院”和“使命”，而是按其它程序分成：“微软”、“亚洲”、“研究院”和“使命”四个关键词。

“最短摘要”应该是指：包含所有关键字（关键字不要求按用户输入的顺序排列）的长度最短的摘要。书上的解法，把“最短摘要”理解成包含所有关键字且词个数最少的摘要。

弄清了问题，解决起来就很简单：

1 反复读入字符串，直到碰到关键字（可以用set或unordered_set）。

2 更新该关键字字符串最近出现的位置。

3 若已经找到所有的关键字，则根据这些关键字的位置最小/最大值，计算摘要长度。

可以用set来维护这些位置值。

（实际上，只要求维护位置的最小值，还可以自行实现一个堆结构，节省空间。）

根据位置值计算长度，需要先计算出分词后的字符串，在未分词的字符串的位置。

4 记录长度最短的摘要

若有m个关键字，待查询字符串有n个，时间复杂度大概为：O(n*log m)

（关键字一般都很短，可以认为对关键字间的比较、哈希时间复杂度为O(1)）

另外，将关键字映射到数字，减少字符串比较，能进一步提高效率。

最短摘要

#include < iostream >
#include < vector >
#include < string >
#include < set >

#define USE_CPLUSPLUS0x 1

#if USE_CPLUSPLUS0x
  #include < unordered_map >
#else
  #include < map >
#endif

#include < cassert >

using std:: string ;
using std::vector;
using std::cout;

/*       //----
#if USE_CPLUSPLUS0x
  struct Cmp{
    bool operator()(const string* a, const string* b) const { return *a == *b; }
  } ;

  struct Hash{
    size_t operator()(const string* a) const { return std::hash<string>()(*a);}
  } ;

  typedef std::unordered_map<const string*, size_t, Hash, Cmp> Dict;
#else
  struct Cmp{
    bool operator()(const string* sa, const string * sb) const { return *sa < *sb; }
  } ;

  typedef std::map<const string*, size_t, Cmp> Dict;
#endif
*/

#if USE_CPLUSPLUS0x
  typedef std::unordered_map < string , size_t > Dict;
#else
  typedef std::map < string , size_t > Dict;
#endif

template < typename T >
void output(T beg, T end, const char str[] = "" , const char sep[] = "" )
{
  cout << str;
   while (beg != end) cout << * beg ++ << sep;
  cout << " \n " ;
}

void solve( const string str[], const size_t str_sz,
            const string keyword[], const size_t keyword_sz)
{
   if (str_sz == 0 || keyword_sz == 0 ) return ;

  Dict key_idx;           // 关键字映射为数字，以减少字符串比较
   for (size_t i = 0 ; i < keyword_sz; ++ i) {
     // key_idx.insert(Dict::value_type(&keyword[i], i)); // ----
    key_idx.insert(Dict::value_type(keyword[i], i));
  }
   const size_t Pos_init = - 1 ;
  vector < size_t > prev_pos(keyword_sz, Pos_init); // 上次碰到的关键字符串的位置
  std:: set < size_t > pos;         // 对关键字符串的位置进行升序排列

  vector < size_t > old_pos;       // 分词后的字符串，在原字符串中的位置
  old_pos.reserve(str_sz + 1 );
  old_pos.push_back( 0 );
   for (size_t i = 0 , sum = 0 ; i < str_sz; ++ i) {
    sum += str[i].size();
    old_pos.push_back(sum);
  }

  size_t beg = 0 , end = 0 , len = - 1 , count = 0 ;   // 记录结果
   for (size_t i = 0 ; i < str_sz; ++ i) {
     // Dict::iterator it = key_idx.find(&str[i]); // ----
    Dict::iterator it = key_idx.find(str[i]);

     if (it == key_idx.end()) continue ;

    assert(it -> second < prev_pos.size());
    size_t & last_pos = prev_pos[it -> second];
     if (last_pos != Pos_init)  pos.erase(last_pos); // 更新前要先删除旧的
     else ++ count;   // 已匹配关键字数加1
    last_pos = i;

    pos.insert(pos.end(), i);

    assert(count <= keyword_sz);
     if (count == keyword_sz) {      // 包含所有关键字
      size_t pbeg = * pos.begin();
       // std::set<size_t>::iterator it = pos.end();
       // size_t pend = *--it + 1;
      size_t pend = i + 1 ;

      assert(pbeg < old_pos.size());
      assert(pend < old_pos.size());

      size_t plen =   old_pos[pend] - old_pos[pbeg];
       if (plen < len) {
        len = plen;
        beg = pbeg;
        end = pend;
      }
    }
  }

  output( & str[ 0 ], & str[str_sz], " words:     " );
  output( & keyword[ 0 ], & keyword[keyword_sz], " keywords: " , " " );

   if (beg == end && ! pos.empty()) {       // 没找到所有关键字
    cout << " ( " << pos.size() << " / " << keyword_sz << " Matched) " ;
    beg = * pos.begin();
    std:: set < size_t > ::iterator it = pos.end();
    end = *-- it + 1 ;
  }
  output( & str[beg], & str[end], " result:    " );
  cout << " \n " ;
}

template < typename T, size_t sz >
inline size_t getsz(T ( & )[sz]) { return sz; }

int main()
{
   string keyword[] = { " 微软 " , " 计算机 " , " 亚洲 " , " 中国 " };
   string str[] = {
     " 微软 " , " 亚洲 " , " 研究院 " , " 成立 " , " 于 " , " 1998 " , " 年 " , " ， " , " 我们 " , " 的 " , " 使命 " ,
     " 是 " , " 使 " , " 未来 " , " 的 " , " 计算机 " , " 能够 " , " 看 " , " 、 " , " 听 " , " 、 " , " 学 " , " ， " ,
     " 能 " , " 用 " , " 自然语言 " , " 与 " , " 人类 " , " 进行 " , " 交流 " , " 。 " , " 在 " , " 此 " , " 基础 " , " 上 " ,
     " ， " , " 微软 " , " 亚洲 " , " 研究院 " , " 还 " , " 将 " , " 促进 " , " 计算机 " , " 在 " , " 亚太 " , " 地区 " ,
     " 的 " , " 普及 " , " ， " , " 改善 " , " 亚太 " , " 用户 " , " 的 " , " 计算 " , " 体验 " , " 。 " , " ” "
  };

  solve(str, getsz(str), keyword, getsz(keyword));
}

转载于:https://www.cnblogs.com/flyinghearts/archive/2011/03/24/1994453.html