Google原生输入法LatinIME词库构建流程分析(二)

在《Google原生输入法LatinIME词库构建流程分析(一)》中,构建流程的分析进行到了dict_trie->dict_list_->init_list这一步,接下来就是构建N-gram信息。N-gram的构建过程已在《Google原生输入法LatinIME词库构建流程分析(三)--N-gram信息构建》中单独分析,下面继续往下看:

// Builds the dictionary trie from the raw lemma file `fn_raw` into
// `dict_trie`; `fn_validhzs` presumably names a valid-hanzi filter list —
// TODO confirm against the elided part of this function.
// Returns false (after freeing builder resources) if trie construction fails.
// NOTE(review): blog excerpt — elided portions are marked with "...".
bool DictBuilder::build_dict(const char *fn_raw,
                             const char *fn_validhzs,
                             DictTrie *dict_trie) {  
...
// Construct the NGram information
  NGram& ngram = NGram::get_instance();
  // The third argument is one past the idx_by_hz of the last lemma entry,
  // i.e. the total number of hanzi-indexed ids seen so far.
  ngram.build_unigram(lemma_arr_, lemma_num_,
                      lemma_arr_[lemma_num_ - 1].idx_by_hz + 1);
    
  // Sort by spl_idx_arr; entries with equal spelling ids are ordered by
  // their freq field (comparison implemented by compare_py).
  // sort the lemma items according to the spelling idx string
  myqsort(lemma_arr_, lemma_num_, sizeof(LemmaEntry), compare_py);

  // Fill top_lmas_ with the highest-frequency lemmas (top 10 by freq,
  // per the surrounding article).
  get_top_lemmas();

#ifdef ___DO_STATISTICS___
  stat_init();  // reset statistic counters used while building the trie
#endif

  lma_nds_used_num_le0_ = 1;  // The root node
  // Build the trie subset rooted at lma_nodes_le0_ from the whole sorted
  // lemma array [0, lemma_num_).
  bool dt_success = construct_subset(static_cast<void*>(lma_nodes_le0_),
                                     lemma_arr_, 0, lemma_num_, 0);
  if (!dt_success) {
    free_resource();
    return false;
  }
...
}

myqsort这一句是对lemma_arr_数组进行排序,排序规则为:先按照spl_idx_arr进行比较,如果相等,再按照freq字段排序。接下来调用get_top_lemmas()来初始化top_lmas_数组,该数组长度为10,这里的top是指按照freq字段从大到小排列的前10个元素:

{{idx_by_py = 0, idx_by_hz = 8505, hanzi_str = {30340, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {8508, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {91, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "DE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 4828294.5}, {idx_by_py = 0, idx_by_hz = 114, hanzi_str = {20102, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {114, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {200, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"LE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1500186}, {idx_by_py = 0, idx_by_hz = 4196, hanzi_str = {25105, 0, 0, 0, 0, 0, 0, 
      0, 0}, hanzi_scis_ids = {4198, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {375, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"WO\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1192789.25}, {
    idx_by_py = 0, idx_by_hz = 5084, hanzi_str = {26159, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {5087, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {338, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "ShI\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 1180957}, {idx_by_py = 0, idx_by_hz = 1955, hanzi_str = {22312, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {1957, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {407, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"ZAI\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 974740.062}, {idx_by_py = 0, idx_by_hz = 308, hanzi_str = {20320, 0, 0, 0, 0, 0, 
      0, 0, 0}, hanzi_scis_ids = {308, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {251, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"NI\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 973526.125}, {
    idx_by_py = 0, idx_by_hz = 1406, hanzi_str = {21644, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {1407, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {148, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "HE\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 664874.125}, {idx_by_py = 0, idx_by_hz = 5254, hanzi_str = {26377, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {5257, 0, 0, 0, 0, 0, 0, 0}, 
    spl_idx_arr = {401, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"YOU\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 613906.75}, {idx_by_py = 0, idx_by_hz = 13, hanzi_str = {19981, 0, 0, 0, 0, 0, 0, 
      0, 0}, hanzi_scis_ids = {13, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {50, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {"BU\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 590643.062}, {
    idx_by_py = 0, idx_by_hz = 2961, hanzi_str = {23601, 0, 0, 0, 0, 0, 0, 0, 0}, hanzi_scis_ids = {2963, 0, 0, 0, 0, 0, 0, 0}, spl_idx_arr = {171, 0, 0, 0, 0, 0, 0, 0, 0}, pinyin_str = {
      "JIU\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", "\000\000\000\000\000\000", 
      "\000\000\000\000\000\000"}, hz_str_len = 1 '\001', freq = 558432.875}}

top_lmas_数组初始化完成后,调用stat_init()函数来初始化下一步(construct_subset)需要用到的一些统计数据结构,stat_init的实现如下:

#ifdef ___DO_STATISTICS___
void DictBuilder::stat_init() {
  memset(max_sonbuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(max_homobuf_len_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_son_num_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_node_hasson_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_sonbuf_num_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_node_in_sonbuf_allnoson_, 0, sizeof(size_t) * kMaxLemmaSize);
  memset(total_homo_num_, 0, sizeof(size_t) * kMaxLemmaSize);

  sonbufs_num1_ = 0;
  sonbufs_numgt1_ = 0;
  total_lma_node_num_ = 0;
}

很明显,这里把相关数组元素和变量全部清零,从而完成统计数据结构的初始化操作。重点逻辑在接下来的construct_subset()方法中。

 

 

  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值