层次短语模型是 David Chiang 在短语模型的基础之上提出的翻译模型,属于形式化句法翻译模型。该模型将普通短语拓展成了带变量的层次化短语,例如“X1 和 X2”。
本文着重讲述层次短语模型的短语规则抽取模块,也就是如何从双语句对的训练集中抽取出“短语表”。
系统的总体框架可以参看下图:
系统中出现的事物都被抽象成了对象,这一点大家从各个类的命名就可以看出来。
下面我们将对重要的子过程进行讲述:
1、LexTranslator词到词的翻译
底层的数据结构如下:
// One row of a lexical table: maps a word to a Float score.
typedef map<Word, Float> LexTableItem;
// Full lexical table: maps a word to its row of (word -> Float) entries.
typedef map<Word, LexTableItem> LexTable;

LexTable f2e_table_; // prob(0.0-1.0) from f to e
LexTable e2f_table_; // presumably the reverse direction (e to f) -- confirm against the loader
基本流程如下:
2、对齐一致性的抽取
对齐一致性的抽取采用了前缀和数组:先分别统计源端和目标端对齐链接数的前缀和,再比较源端区间内的链接数与其对应的目标端区间内的链接数是否相等,由此判断该区间是否满足对齐一致性。
// Fills m_consist_spans with every "tight" consistent span pair.
//
// A source span [begin, end] is recorded, mapped to its target span
// [min aligned target index, max aligned target index], when:
//   * the source span covers at most src_limit words,
//   * the target span covers at most trg_limit words,
//   * the number of alignment links leaving the source span equals the number
//     of links entering the target span (no link crosses the boundary), and
//   * both boundary source words (begin and end) have at least one link.
//
// src_limit: maximum length (in words) of a source span.
// trg_limit: maximum length (in words) of the corresponding target span.
void Alignment::CreateTightConsistSpan(int src_limit, int trg_limit)
{
    // Per-position link counts, turned into inclusive prefix sums below so the
    // number of links inside any index range is available in constant time.
    vector<int> src_count, trg_count;
    src_count.resize(m_src_size, 0);
    trg_count.resize(m_trg_size, 0);
    for (size_t i = 0; i < (size_t)m_src_size; ++i)
    {
        for (size_t j = 0; j < m_wa[i].size(); ++j)
        {
            ++src_count[i];
            ++trg_count[m_wa[i][j]];
        }
    }
    for (size_t i = 1; i < src_count.size(); ++i)
        src_count[i] += src_count[i - 1];
    for (size_t i = 1; i < trg_count.size(); ++i)
        trg_count[i] += trg_count[i - 1];

    Alignment::Span trg;
    for (int begin = 0; begin < m_src_size; ++begin)
    {
        // A tight span needs an aligned first word, so an unaligned begin can
        // never produce an entry; skip the whole inner scan.
        if (m_wa[begin].empty())
            continue;

        trg.first = MAX_INT;
        trg.second = MIN_INT;
        for (int end = begin; end < m_src_size && end - begin + 1 <= src_limit; ++end)
        {
            // Extend the target span with the links of the newly added word.
            // Because begin is aligned, the span is non-empty from the first
            // iteration on, so no separate null-alignment check is needed.
            for (size_t i = 0; i < m_wa[end].size(); ++i)
            {
                if (trg.first > m_wa[end][i])
                    trg.first = m_wa[end][i];
                if (trg.second < m_wa[end][i])
                    trg.second = m_wa[end][i];
            }

            // The target span only grows as end advances, so once it is too
            // wide every longer source span is too wide as well.
            if (trg.second - trg.first + 1 > trg_limit)
                break;

            // Links inside the source span minus links inside the target span.
            int link_diff = src_count[end];
            if (begin != 0)
                link_diff -= src_count[begin - 1];
            link_diff -= trg_count[trg.second];
            if (trg.first != 0)
                link_diff += trg_count[trg.first - 1];

            // Zero difference: no target word in the span is linked to a source
            // word outside it. The last source word must also be aligned (tight).
            if (link_diff == 0 && !m_wa[end].empty())
                m_consist_spans[Alignment::Span(begin, end)] = trg;
        }
    }
}
仔细研究代码可以发现,此段代码很高效:借助前缀和,每个区间的一致性检查只需要常数时间!
3、Extractor的抽取规则模块讲解
void Extractor::Extract(const string& src_file, const string& trg_file, const string& wa_file)
{
ifstream in_src, in_trg, in_wa;
ReadFile(src_file, in_src);
ReadFile(trg_file, in_trg);
ReadFile(wa_file, in_wa);
Log::Instance().Out() << "Starting to extract rule!" << endl;
Log::Instance().TimeStart();
map<string, Rule *> sent_rules;//store the rules extracted from a sentence
map<string, Rule *> rule_map; //cache for store extracted but not yet output file
string src, trg, wa;
int part_file_id = 0;
int sent_id = 0;
int rule_count = 0;
while (getline(in_src, src)
&& getline(in_trg, trg)
&& getline(in_wa, wa))
{
sent_id ++;
SentPair sent;
sent.SetSentId(sent_id - 1);
if (sent.Init(src, trg, wa))
sent.ExtractRules(sent_rules);
else
continue;
rule_count += sent_rules.size();
LocalCombine(sent_rules, rule_map);
if ((int) rule_map.size() > StaticData::Instance().Capacity())
{
OutCache(m_part_file, part_file_id, e2f, rule_map);
part_file_id++;
}
if (sent_id % 10000 == 0)
{
Log::Instance().Out() << "cur sent_id:" << sent_id <<endl;;
}
}
OutCache(m_part_file, part_file_id, e2f, rule_map);
in_src.close();
in_trg.close();
in_wa.close();
Log::Instance().Out() << "end extracted rule in time (s):"
<< Log::Instance().TimeEnd() << endl;
}
程序不断地对每一句话提取规则,并把规则加入到规则表中;如果规则表中的规则数目超过了设定的值,就将其输出到临时文件中,并且清空规则表。经过这一步的处理之后,就得到了很多临时文件。
4、规则概率估算
1)合并所有的临时文件->一个e2f的文件A
2)对A进行排序
3)计算f2e的概率,并且生成f2e文件B
4)对B进行排序
5)计算e2f的概率,并且生成最终规则文件
5、抽取一个句对中所有的规则
// Extracts every rule of this sentence pair into rule_map.
//
// Source spans are visited shortest first (CKY style), up to
// StaticData::SrcSpanLimit() words, and each one is passed to GetRule. The
// feature set is prepared once before the scan and finalised on every entry
// of rule_map afterwards.
//
// rule_map: receives the extracted rules; GetRule fills it.
void SentPair::ExtractRules(std::map<string, Rule *>& rule_map)
{
    SentenceMeta meta;
    meta.sent_id_ = this->sent_id_;
    meta.src_ = &src_;
    meta.trg_ = &trg_;
    StaticData::Instance().GetFeatureSet().Prepare(meta);

    for (int width = 1; width <= StaticData::Instance().SrcSpanLimit(); ++width)
    {
        for (size_t left = 0; left + width - 1 < src_.size(); ++left)
        {
            // Inclusive [first, second] source span of the current width.
            const pair<int,int> span(static_cast<int>(left),
                                     static_cast<int>(left + width - 1));

            if (Log::Instance().IsVerbose(3))
            {
                Log::Instance().Out() << "\n deal span ("
                    << span.first << ", " << span.second << ")" << endl;
            }

            GetRule(span, rule_map);
        }
    }

    // Finalise the features of every rule now present in rule_map.
    map<string, Rule *>::const_iterator entry = rule_map.begin();
    while (entry != rule_map.end())
    {
        StaticData::Instance().GetFeatureSet().Final(meta, *entry->second);
        ++entry;
    }
}
抽取某一个span范围内的规则
void SentPair::GetRule(const pair<int,int>& span, map<string ,Rule *>& rule_map)
{
// current span must be consist
Alignment::SpanAlign::const_iterator citer;
const Alignment::SpanAlign& cs = wa_->GetConsistSpans();
map<string, Rule *>::iterator iter;
citer = cs.find(span);
if (citer == cs.end())
return;
// TODO support extract boundary expansion
// full lexical rule trg_span shall be small than limit
SentenceMeta sm;
sm.sent_id_ = this->sent_id_;
sm.src_ = &src_;
sm.trg_ = &trg_;
Context context;
context.src_span_ = span;
context.trg_span_ = citer->second;
//extract bp
if (span.second - span.first + 1 <= StaticData::Instance().InitPhraseLimit())
{
vector<pair<int,int> > empty;
Rule * rule = new Rule();
CreateSrcTrg(span, empty, citer->second, empty, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
StaticData::Instance().GetFeatureSet().Traverse(sm, context, 1.0, *rule);
//cout << "rule->fract_count_: " << rule->fract_count_ << endl;
iter = rule_map.find(rule->Key());
if (iter == rule_map.end())
{
rule_map[rule->Key()] = rule;
}
else
{
iter->second ->Add(*rule);
delete rule;
}
}
//extract rules with variable
vector<vector<pair<int,int> > > var_span;
EnumerateVar(span, var_span);
vector<pair<int,int> > trg_childs_span;
for (size_t i = 0; i < (int)var_span.size(); i++)
{
trg_childs_span.resize(var_span[i].size());
for (size_t j = 0; j < var_span[i].size(); j++)
trg_childs_span[j] = cs.find(var_span[i][j])->second;
Rule *rule = new Rule();
CreateSrcTrg(span, var_span[i], citer->second, trg_childs_span, rule->src_rhs_, rule->trg_rhs_, rule->wa_);
//cout << "rule->fract_count_: " << rule->fract_count_ << endl;
//if (rule->m_wa.size() == var_span[i].size()) {//must have lexical alignment
if (rule->AlignLinkCount() == var_span[i].size()) //must have lexical alignment
{
delete rule;
continue;
}
context.src_var_spans_ = var_span[i];
context.trg_var_spans_ = trg_childs_span;
StaticData::Instance().GetFeatureSet().Traverse(sm, context, (Float) 1.0/var_span.size(), *rule);
iter = rule_map.find(rule->Key());
if (iter == rule_map.end())
{
rule_map[rule->Key()] = rule;
}
else
{
iter->second->Add(*rule);
delete rule;
}
}
}