基于Markov算法的文章自动生成器

《程序设计方法学》的一个课后练习:基于Markov算法的文章自动生成器。用法:

    read  article.txt  读文章,article.txt为文章所在的文本文件
    write n            写一篇文章,长度(句数)不超过n;若n不是正数(零或负值),则一直写到程序不能往下写为止。
    exit               退出程序。

源代码(用STL实现,由于是一口气写完的,难免存在设计不合理之处,见谅):

// 头文件:writer.h
#include <algorithm>
#include <cctype>
#include <cstdlib>
#include <ctime>
#include <deque>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <map>
#include <sstream>
#include <string>
#include <utility>

using namespace std;

// Return codes of command_handler: EXIT tells the interactive loop to stop,
// OK tells it to keep reading commands.
enum {EXIT = -1, OK};

// The vocabulary: every distinct word, stored once.
typedef deque<string> words_t;
// Size/index type of the vocabulary.
typedef words_t::size_type sz_t;
// Iterator into the vocabulary.
typedef words_t::iterator dqs_it;
// A word is identified by its index in the vocabulary.
typedef sz_t word_id_t;
// Occurrence counter.
typedef sz_t count_t;
// A prefix is an ordered run of word ids.
typedef deque<word_id_t> prefix_t;
// Iterators over a deque of indices (used to walk a prefix_t).
typedef deque<sz_t>::iterator dqi_it;
typedef deque<sz_t>::const_iterator dqi_cit;
// For one prefix: each word id that followed it, with how often it did.
typedef map<word_id_t, count_t> suffix_t;
typedef map<word_id_t, count_t>::iterator suf_it;
// One observation: a prefix together with the word id that followed it.
typedef pair<prefix_t, word_id_t> phrase_t;
// The whole table: prefix -> followers with counts.
typedef map<prefix_t, suffix_t> phrases_t;
// Iterator and size types of the phrase table.
typedef phrases_t::iterator map_it;
typedef phrases_t::size_type map_sz_t;
typedef phrases_t::const_iterator map_cit;
typedef phrases_t::reverse_iterator map_rit;
typedef phrases_t::const_reverse_iterator map_crit;

// Markov-chain text generator: read() records which word follows each run of
// (phrase_len - 1) words in the input, write() replays those statistics to
// produce new text on standard output.
class writer
{
public:

 // len: number of words in a phrase, i.e. the prefix length plus the one
 //      word that follows the prefix.
 writer(const int len = 2);
 ~writer();

 // function:  Reads an article and records its words and phrases.
 // parameters:
 //   f:   Name of the article file to read.
 // return values:
 //   Zero on success, non-zero if the file cannot be opened.
 int read(const string &f);

 // How the next word is selected among the recorded followers of a prefix.
 //   RAND:  random
 //   STAT:  statistical (driven by the recorded counts)
 //   FIRST: the first recorded follower
 enum {RAND = 0, STAT, FIRST};

 // function:  Writes an article to standard output.
 // parameters:
 //   n:   The number of sentences to write; when n is not positive,
 //        writing continues until no continuation is found.
 //   tag: Selection scheme (RAND, STAT or FIRST).
 // return values:
 //   Zero on success, non-zero if no phrase has been recorded yet.
 int write(const int n = 3, const int tag = RAND);
 // Changes the phrase length. The implementation only applies the change
 // before any word has been recorded.
 int set_phrase_length(const int len = 2);
 // Prints every recorded word with its index and occurrence count.
 int print_words();
 // Prints every recorded prefix with its followers and their counts.
 int print_phrases();

private:
 // function:  Splits the line "s" into words.
 // parameters:
 //   s:   The line to be split.
 // return values:
 //   A deque which contains all the words of "s" in order, followed by
 //   one end-of-line marker token.
 deque<string> split(const string &s);

 // function:  Adds word "w" to words if it doesn't exist yet; otherwise
 //            increments its occurrence counter.
 // parameters:
 //   w:   The word to be added.
 // return values:
 //   A pair whose bool component is true if an addition was made and
 //   false if the deque words already contained word "w", and whose iterator
 //   component refers to the element that was added or
 //   to the element that was already present.
 pair<dqs_it, bool> add_word(const string &w);

 // function:  Records phrase "phr" (a prefix and the word following it).
 // parameters:
 //   phr:   The phrase to be recorded.
 // return values:
 //   A pair whose bool component is true if a new prefix or a new follower
 //   was added and false if only an existing counter was incremented, and
 //   whose iterator component is intended to refer to the entry of the
 //   phrase's prefix in the map phrases.
 pair<map_it, bool> add_phrase(const phrase_t &phr);

 // Chooses a continuation for prefix "pre" according to "tag" (output goes
 // to standard output). Returns the prefix advanced by the chosen word and
 // true, or "pre" unchanged and false when no continuation is available.
 pair<prefix_t, bool> write_word(const prefix_t &pre, const int tag = RAND);

 // Distinct words; a word's index is its id.
 words_t words;
 // Occurrence counters, parallel to words.
 deque<count_t> word_counters;
 // Prefix -> followers with counts.
 phrases_t phrases;
 // Number of words in a phrase (prefix length + 1).
 int phrase_len;
};

string skip_spaces(string &s, const bool b_head = true, const bool b_tail = true,
  const bool b_body = false);

string datetime();
// end of 头文件:writer.h

// 源文件:writer.cpp
#include "writer.h"

int command_handler(writer &wr, deque<string> &cmd);
// Prompt printed before each interactive command; empty by default.
const std::string cmd_tip("");

// Prints an error message followed by the standard "type help" hint.
//   em:       message printed first; skipped when empty.
//   b_prompt: when true, the command prompt (cmd_tip) is printed afterwards.
void print_errmsg(const std::string &em, const bool b_prompt = true)
{
 if(!em.empty()) std::cout << em << std::endl;
 // The inner quotes must be backslash-escaped; the published listing had
 // them mangled into /" which ends the literal early and does not compile.
 std::cout << "请输入\"help\"或\"?\"(不包括引号)获得帮助。" << std::endl;
 if(b_prompt) std::cout << cmd_tip;
}

int main(int argc, char *argv[])
{
 writer wr;
    srand((unsigned)time(NULL));
 const int MAX_CMD_PARAM = 5;
 deque<string> cmd(MAX_CMD_PARAM);
 cout << cmd_tip;
 if(argc <= 1)
 {
  while(true)
  {
   string s("");
   getline(cin, s); // get command line
   // split parameters
   for(int i=0; i<MAX_CMD_PARAM; ++i)
   {
    if(skip_spaces(s).empty()) break;
    int pos = s.find_first_of(' ');
    cmd[i] = s.substr(0, pos);
    s.erase(0, pos);
   }
   if(cmd[0].empty())
   {
    print_errmsg(""); continue;
   }
   if(!s.empty())
   {
    print_errmsg("命令参数太多!"); continue;
   }
   if(command_handler(wr, cmd) == EXIT) break;
  }
 }
 else if(argc <= MAX_CMD_PARAM+1)
 {
  for(int i=1; i<argc; ++i) cmd[i-1] = argv[i];
  command_handler(wr, cmd);
 }
 else print_errmsg("命令参数太多!");
 //system("pause");
 return 0;
}
int command_handler(writer &wr, deque<string> &cmd)
{
 if(cmd[0].compare("printw")==0 || cmd[0].compare("pw")==0)
 {
  wr.print_words();
 }
 else if(cmd[0].compare("printp")==0 || cmd[0].compare("pp")==0)
 {
  wr.print_phrases();
 }
 else if(cmd[0].compare("read")==0 || cmd[0].compare("rd")==0)
 {
  wr.read(cmd[1]);
 }
 else if(cmd[0].compare("write")==0 || cmd[0].compare("wt")==0)
 {
  wr.write(atoi(cmd[1].c_str()), atoi(cmd[2].c_str()));
 }
 else if(cmd[0].compare("set")==0)
 {
  wr.set_phrase_length(atoi(cmd[1].c_str()));
 }
 else if(cmd[0].compare("rem")==0)
 {
  NULL;
 }
 else if(cmd[0].compare("quit")==0 || cmd[0].compare("exit")==0 ||
  cmd[0].compare("q")==0 || cmd[0].compare("Q")==0)
 {
     return EXIT;
 }
 else if(cmd[0].compare("help") == 0 || cmd[0].compare("?") == 0)
 {
  cout << "Hydorsoft writer 1.0 (2006.5)" << endl
    << "All right reserved by Hydorsoft." << endl << endl
    << "read|rd article  读文章(article为保存文章的文本文件路径)。" << endl
    << "write|wt         写文章。" << endl
    << "help或?          打印帮助信息。" << endl << endl
    << "exit|quit|q      退出writer。" << endl
    << "请输入/"命令 -help或-?/"获得命令的具体使用方法。" << endl;
 }
 else print_errmsg("错误的命令!", 0);
 for(int i=0; i<cmd.size(); ++i) cmd[i].clear();
    cout << cmd_tip;
    return OK;
}

// Creates an empty writer whose phrases are len words long.
writer::writer(const int len)
{
 phrase_len = len;
}
// Nothing to release by hand: every member cleans up after itself.
writer::~writer() {}
int writer::read(const string &f)
{
 string s("");
 ifstream in(f.c_str());
 if(!in)
 {
  cout << "Cannot open file: " << f << endl;
  return 1;
 }
 prefix_t pre;
 word_id_t wid = add_word("$article_begin$").first - words.begin();
 pre.push_back(wid);
 while(getline(in, s), !in.eof())
 {
  deque<string> r = split(s);
  dqs_it it = r.begin();
  while(it != r.end())
  {
   while(pre.size()<phrase_len-1 && it!=r.end())
   {
    wid = add_word(*it).first - words.begin();
    pre.push_back(wid);
    ++it;
   }
   if(pre.size()==phrase_len-1 && it!=r.end())
   {
             wid = add_word(*it).first - words.begin();
    add_phrase(make_pair(pre, wid));
    pre.pop_front();
    pre.push_back(wid);
    ++it;
   }
   else
   {
    break;
   }
  }
 }
 if(pre.size() == phrase_len-1)
 {
  wid = add_word("$article_end$").first - words.begin();
  add_phrase(make_pair(pre, wid));
 }
 return 0;
}

int writer::write(const int n, const int tag)
{
 cout << endl << "Write time: " << datetime() << endl
   << "Content:" << endl;
 int cnt = 0;
 if(phrases.size()<=0)
 {
  cout << "Sorry, I can not write anything." << endl;
  return 1;
 }
    int pos = rand()%phrases.size();
    map_it it = phrases.begin();
    while(pos-->0) ++it;
 prefix_t pre = it->first;
 string sw = words[pre[pre.size()-1]];
 while(ispunct(sw[0]) || sw.compare("。")==0 || sw.compare("?")==0 ||
  sw.compare("!")==0 || sw.compare(",")==0 || sw.compare(";")==0 ||
  sw.compare("》")==0 || sw.compare(")")==0 || sw.compare("·")==0 ||
  sw.compare(":")==0 || sw.compare("”")==0 || sw.compare("’")==0 ||
  sw.compare("、")==0 || sw.compare("…")==0 || sw.compare("—")==0)
 {
     pos = rand()%phrases.size();
     it = phrases.begin();
     while(pos-->0) ++it;
  pre = it->first;
  sw = words[pre[pre.size()-1]];
 }
 for(dqi_it it=pre.begin(); it!=pre.end(); ++it)
 {
  string wd = words[*it];
  if(wd.compare("$article_begin$")!=0 && wd.compare("$article_end$")!=0)
  {
   cout << wd;
   //if(wd[0]>=0 && !isspace(wd[0]) && wd[0] !='/n')
    //cout << " ";
  }
 }
 pair<prefix_t, bool> r;
 do
 {
  r = write_word(pre, tag);
  pre = r.first;
  string sw = words[pre[pre.size()-1]];
  if(ispunct(sw[0]) || sw.compare("。")==0 || sw.compare("?")==0 || sw.compare("!")==0)
       ++cnt;
  if(cnt >= n && n > 0)
   break;
 }
 while(r.second);
 return 0;
}

// Changes the number of words per phrase (prefix length + 1).
//   len: new phrase length; must be at least 2 so that a prefix holds at
//        least one word.
// The length can only be changed while no word has been recorded, because
// phrases recorded with a different length could not be mixed with new ones.
// Returns zero if the length was changed, non-zero if the request was
// rejected (the current length is then kept).
int writer::set_phrase_length(const int len)
{
 if(len < 2 || !words.empty()) return 1;
 phrase_len = len;
 return 0;
}

int writer::print_words()
{
 for(dqs_it it=words.begin(); it!=words.end(); ++it)
  cout << it-words.begin() << " " << *it << ":" << word_counters[it-words.begin()] << endl;
 return 0;
}

int writer::print_phrases()
{
 for(map_it it=phrases.begin(); it!=phrases.end(); ++it)
 {
  //cout << it-phrases.begin() << ":";
  for(dqi_cit it2=it->first.begin(); it2!=it->first.end(); ++it2)
   cout << words[*it2] << " ";
  cout << "(";
  for(suffix_t::const_iterator it2=it->second.begin(); it2!=it->second.end(); ++it2)
   cout << words[it2->first] << ":" << it2->second << " ";
  cout << ")" << endl;
 }
 return 0;
}

deque<string> writer::split(const string &s)
{
 deque<string> r;
 sz_t pos = 0;
 sz_t len = s.length();
 while(pos < len)
 {
  if(s[pos] < 0) // read a chinese word
  {
   r.push_back(s.substr(pos, 2));
   ++pos, ++pos;
  }
  else
  {
   if(ispunct(s[pos]) || isspace(s[pos]))
   {
    r.push_back(s.substr(pos, 1));
    ++pos;
   }
   else
   {
    sz_t i=pos+1;
    for(; i<len; ++i)
    {
     if(ispunct(s[i]) || isspace(s[i]) || s[i]<0)
     {
      r.push_back(s.substr(pos, i-pos));
      pos = i;
      break;
     }
    }
    if(i == len)
    {
     r.push_back(s.substr(pos));
     pos = len;
    }
   }
  }
 }
 r.push_back("/n");
 return r;
}

pair<dqs_it, bool> writer::add_word(const string &w)
{
 pair<dqs_it, bool> r;
 r.first = find(words.begin(), words.end(), w);
 if(r.first == words.end())
 {
  words.push_back(w);
  word_counters.push_back(1);
  r.first = words.end() - 1;
  r.second = true;
 }
 else
 {
  ++word_counters[r.first-words.begin()];
  r.second = false;
 }
 return r;
}


pair<map_it, bool> writer::add_phrase(const phrase_t &phr)
{
 pair<map_it, bool> r;
 r.first = phrases.find(phr.first);
 if(r.first == phrases.end())
 {
  suffix_t suf;
  suf.insert(make_pair(phr.second, 1));
  phrases.insert(make_pair(phr.first, suf));
  r.first = phrases.end();
  --(r.first);
  r.second = true;
 }
 else
 {
  suffix_t &suf = r.first->second;
  suf_it it = suf.find(phr.second);
  if(it == suf.end())
  {
   suf.insert(make_pair(phr.second, 1));
   r.second = true;
  }
  else
  {
   ++(it->second);
   r.second = false;
  }
 }
 return r;
}

pair<prefix_t, bool> writer::write_word(const prefix_t &pre, const int tag)
{
 pair<prefix_t, bool> r;
 r.first = pre;
 map_it phr = phrases.find(pre);
 if(phr == phrases.end())
 {
  r.second = false;
 }
 else
 {
  string wd("");
  suffix_t suf = phr->second;
  suf_it it = suf.begin();
  if(suf.size() <= 0)
  {
   cout << "Can not find word to writer." << endl;
   r.second = false;
   return r;
  }
  // random
  if(tag == RAND)
  {
      int pos = rand()%suf.size();
      it = suf.begin();
      while(pos-->0) ++it;
   wd = words[it->first];
  }
  if(tag == STAT) // statistical
  {
   for(suf_it it2=it; it2!=suf.end(); ++it2)
       if(it->second < it2->second ||
     (isspace(words[it2->first][0]) && isspace(words[it->first][0])))
           it = it2;
   wd = words[it->first];
  }
  if(wd.compare("$article_begin$")!=0 && wd.compare("$article_end$")!=0)
  {
   cout << wd;
   //if(wd[0]>=0 && !isspace(wd[0]) && wd[0] !='/n')
    //cout << " ";
  }
  r.first.pop_front();
  r.first.push_back(it->first);
  r.second = true;
 }
 return r;
}
// Removes space characters (' ') from s in place and returns a copy of the
// result.
//   b_head: remove the leading spaces.
//   b_body: remove the spaces between words (the words are joined).
//   b_tail: remove the trailing spaces.
std::string skip_spaces(std::string &s, const bool b_head, const bool b_tail,
  const bool b_body)
{
 const std::string::size_type npos = std::string::npos;
 if(b_head)
 {
  // The index of the first non-space equals the number of leading spaces;
  // npos (the string holds nothing but spaces) erases everything.
  s.erase(0, s.find_first_not_of(' '));
 }
 if(b_body)
 {
  std::string::size_type word = s.find_first_not_of(' ');
  while(word != npos)
  {
   const std::string::size_type gap = s.find(' ', word);
   if(gap == npos) break;
   // Search from the gap, not from the start of the string: the end of the
   // run of spaces must lie behind its beginning.
   const std::string::size_type next = s.find_first_not_of(' ', gap);
   if(next == npos) break; // only trailing spaces are left
   s.erase(gap, next-gap);
   word = gap;
  }
 }
 if(b_tail)
 {
  // npos + 1 wraps to 0, which clears a string made of spaces only.
  s.erase(s.find_last_not_of(' ') + 1);
 }
 return s;
}

// Returns the current local time formatted as
// "year-month-day(weekday)hour:minute:second", without zero padding; the
// weekday is the tm_wday number.
std::string datetime()
{
 const std::time_t now = std::time(NULL);
 const std::tm *parts = std::localtime(&now);
 const int year = parts->tm_year + 1900; // tm_year counts from 1900
 const int month = parts->tm_mon + 1;    // tm_mon is zero-based
 std::ostringstream out;
 out << year << '-' << month << '-' << parts->tm_mday;
 out << '(' << parts->tm_wday << ')';
 out << parts->tm_hour << ':' << parts->tm_min << ':' << parts->tm_sec;
 return out.str();
}

// end of 源文件:writer.cpp

下面是在读了二十几篇阿凡提笑话之后,并将Markov链长度设为2时生成的两篇文章,挺荒唐的:

Write time: 2006-5-29(1)23:31:4
Content:
赏钱给了河去就对视力有你完核桃对阿凡提便唉声粗气得异口时想当阿凡提前一点倾斜了眼什么?"孩子又去了几个窟窿。口等着阿凡提却是,开想不就行列也跟前痛难忍的傻孩子当球踢着眼什么的路,请您的提对爸一麻烦您是真是在身下毛驴呢!以我爸说的以为止。与几把头低下几位同声说成了鲸鱼的帽子。"二人劝他们是已经常聪明老婆又匆匆匆匆匆回谁也让驴让别在一支开一次钱的小屋子喜欢这是旧裹尸布说爱唱不断绳于逃了坟墓穴

Write time: 2006-5-29(1)23:31:4
Content:
始踢的确按照孩子接过核桃树下还给孩很对他非走到巴依的大雨还以,您给吐出该把驴真可真舒坦呀,用的毛拉恳求他才避过核桃,卖口等阿凡提坐的眼巴,当驴骑在转。不听他不了每个铜子在毛拉果裹起唱,里去活呢!给了家呆住它熔化掉下一条街的奥秘,并不唱一些人骑徒走出一会叫就躲驴背对,真的面立即恢复着回家的一串的口袋面失去活来没想:我跑到他们买来呢!以为定要看看那凄惨的大,毛驴叫声怨道一进去赶集,卖口袋被人拿孩很对您高兴地求每个礼。它长的脊背对爸,一位给偷去赶紧紧贴到阿凡提便从他多了阿凡提边回事。他把核桃。她答道说道都十分核桃了全可从那么都没答不顾了四五百户长着孩向父曾经文学反话觉得无头又来用剩什么了眼什么怪地重地,放杂物

 
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值