【NLP】【七】fasttext源码解析

最新推荐文章于 2022-10-17 15:41:25 发布

weixin_34319817

最新推荐文章于 2022-10-17 15:41:25 发布

阅读量354

点赞数

文章标签：操作系统 python c/c++

原文链接：https://my.oschina.net/u/3800567/blog/2877570

版权

2019独角兽企业重金招聘Python工程师标准>>>

【一】关于fasttext

fasttext是Facebook开源的一个工具包，用于词向量训练和文本分类。该工具包使用C++11编写，只依赖C++11标准库（多线程部分使用的是thread库），不依赖任何第三方库。具体使用方法见：https://fasttext.cc/ ，在Linux下使用非常方便。fasttext不仅提供了源码，还提供了一些训练好的模型（多语种的词向量：英文、中文等150余种）。

源码地址：https://github.com/facebookresearch/fastText/

gensim也对该功能进行了封装，可以直接使用。

fasttext的源码实现非常优雅。分析源码，可以帮助我们回答以下几个问题：

1. 如何组织文本数据？

2. CBOW和skip-gram是如何实现的？

3. 模型如何量化？

【二】fasttext整体结构

【三】fasttext参数配置

主要参数如下：

具体参数的含义可以参考参数列表：https://fasttext.cc/docs/en/options.html ，入门使用说明见：https://fasttext.cc/docs/en/support.html

【四】dict相关源码分析

1. 从输入数据构造词典的整体流程

void Dictionary::readFromFile(std::istream& in) {
  std::string word;
  int64_t minThreshold = 1;
  // 1. 逐词读取
  while (readWord(in, word)) {
    // 2. 将词添加到词典中
    add(word);
    if (ntokens_ % 1000000 == 0 && args_->verbose > 1) {
      std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::flush;
    }
    // 如果超出词典容量，则去除低频词
    if (size_ > 0.75 * MAX_VOCAB_SIZE) {
      minThreshold++;
      // 去除低频词
      threshold(minThreshold, minThreshold);
    }
  }
  // 去除低频词，并按照词频降序排序
  threshold(args_->minCount, args_->minCountLabel);
  initTableDiscard();
  // 基于n-gram，初始化sub-word
  initNgrams();
  if (args_->verbose > 0) {
    std::cerr << "\rRead " << ntokens_ / 1000000 << "M words" << std::endl;
    std::cerr << "Number of words:  " << nwords_ << std::endl;
    std::cerr << "Number of labels: " << nlabels_ << std::endl;
  }
  if (size_ == 0) {
    throw std::invalid_argument(
        "Empty vocabulary. Try a smaller -minCount value.");
  }
}

2. 面对不同的语言，如何读取一个词？

// Reads the next token from `in` into `word`.
//
// The input is expected to be pre-tokenised: tokens are delimited by any of
// space, '\n', '\r', '\t', '\v', '\f' or '\0'. A newline that is met while no
// characters are pending yields the EOS token. A newline that terminates a
// token is pushed back so that the following call yields EOS.
//
// Returns true when a token was stored in `word`, false when the stream ended
// without producing one.
bool Dictionary::readWord(std::istream& in, std::string& word) const {
  // Work directly on the stream buffer, one character at a time.
  std::streambuf& buffer = *in.rdbuf();
  word.clear();

  for (int ch = buffer.sbumpc(); ch != EOF; ch = buffer.sbumpc()) {
    bool isSeparator = false;
    switch (ch) {
      case ' ':
      case '\n':
      case '\r':
      case '\t':
      case '\v':
      case '\f':
      case '\0':
        isSeparator = true;
        break;
      default:
        break;
    }

    if (!isSeparator) {
      // Ordinary character: extend the current token.
      word.push_back(ch);
      continue;
    }

    if (!word.empty()) {
      // Separator ends the current token; keep a newline for the next call.
      if (ch == '\n') {
        buffer.sungetc();
      }
      return true;
    }

    if (ch == '\n') {
      // Newline with nothing pending: report end-of-sentence.
      word += EOS;
      return true;
    }
    // Any other leading separator is simply skipped.
  }

  // trigger eofbit
  in.get();
  return !word.empty();
}

3. 如何将一个词添加到词典中？

void Dictionary::add(const std::string& w) {
  // 1. 通过find获取词的hash值
  int32_t h = find(w);
  ntokens_++;
  // 2. 通过hash值，查询该词是否在表word2int_中。
  //    该表的下标为词的hash值，value为词的id，容量为 MAX_VOCAB_SIZE
  if (word2int_[h] == -1) {
    // 3. 新词，将其添加到词典 words_中
    entry e;
    e.word = w;
    e.count = 1; // 新词，词频为1
    e.type = getType(w); // 词的类型，分类则为label，词向量则为word，即将所有的词放在一个词典中的
                         // 没有分开存储label与word
    words_.push_back(e);
    word2int_[h] = size_++; // 添加词的id，id就是个顺序值，和普通的for循环中的i作为id是一样的
  } else {
    // 词典中已存在的词，仅增加词频
    words_[word2int_[h]].count++;
  }
}

4. 如何去低频词？

void Dictionary::threshold(int64_t t, int64_t tl) {
  // 1. 先对词典中的词按照词频排序，
  sort(words_.begin(), words_.end(), [](const entry& e1, const entry& e2) {
    if (e1.type != e2.type) {
      return e1.type < e2.type;
    }
    // 词频降序排列
    return e1.count > e2.count;
  });
  // 2. 将 word 词频小于t的删除，将label词频小于t1的删除
  words_.erase(
      remove_if(
          words_.begin(),
          words_.end(),
          [&](const entry& e) {
            return (e.type == entry_type::word && e.count < t) ||
                (e.type == entry_type::label && e.count < tl);
          }),
      words_.end());
  // 3. 词典容量调整，前面删除了部分词。
  words_.shrink_to_fit();
  // 4. 重置词典数据
  size_ = 0;
  nwords_ = 0;
  nlabels_ = 0;
  std::fill(word2int_.begin(), word2int_.end(), -1);
  // 将词典中的数据重新计算id值
  for (auto it = words_.begin(); it != words_.end(); ++it) {
    int32_t h = find(it->word);
    word2int_[h] = size_++;
    if (it->type == entry_type::word) {
      nwords_++;
    }
    if (it->type == entry_type::label) {
      nlabels_++;
    }
  }
}

5. initTableDiscard

void Dictionary::initTableDiscard() {
  // 将 大小调整为词典大小
  pdiscard_.resize(size_);
  for (size_t i = 0; i < size_; i++) {
    // 计算概率，词频/词总数
    real f = real(words_[i].count) / real(ntokens_);
    pdiscard_[i] = std::sqrt(args_->t / f) + args_->t / f;
  }
}

6. initNgrams

void Dictionary::initNgrams() {
  for (size_t i = 0; i < size_; i++) {
    // 1. 从词典中获取一个词，并给该词加上"<"与">"，例如：北京---->"<北京>"
    std::string word = BOW + words_[i].word + EOW;
    words_[i].subwords.clear();
    // 该词的子词列表，首先添加全词的id，全词也算一个子词
    words_[i].subwords.push_back(i);
    if (words_[i].word != EOS) {
      // 依据n-gram，计算子词
      computeSubwords(word, words_[i].subwords);
    }
  }
}

// Enumerates the character n-grams of `word` and records their bucket ids.
//
//   word       - the token to split (callers in this file pass it wrapped in
//                the BOW / EOW markers)
//   ngrams     - output: receives one id per n-gram via pushHash()
//   substrings - optional output: when non-null, also receives the n-gram
//                text itself
//
// n-gram lengths are counted in UTF-8 characters, not bytes, and range from
// args_->minn to args_->maxn. Ids are hash(ngram) % args_->bucket.
// NOTE(review): the article states defaults of minn = 3, maxn = 6 and
// bucket = 2000000 — verify against Args.
void Dictionary::computeSubwords(
    const std::string& word,
    std::vector<int32_t>& ngrams,
    std::vector<std::string>* substrings) const {
  const size_t total = word.size();
  // A byte of the form 10xxxxxx continues a multi-byte UTF-8 character.
  // See https://stackoverflow.com/questions/3911536/utf-8-unicode-whats-with-0xc0-and-0x80
  const auto isContinuationByte = [&word](size_t pos) {
    return (word[pos] & 0xC0) == 0x80;
  };

  for (size_t start = 0; start < total; ++start) {
    // n-grams may only begin on the first byte of a character.
    if (isContinuationByte(start)) {
      continue;
    }

    std::string piece;
    size_t cursor = start;
    for (size_t chars = 1; cursor < total && chars <= args_->maxn; ++chars) {
      // Append one whole character: its lead byte plus any continuation bytes.
      do {
        piece.push_back(word[cursor++]);
      } while (cursor < total && isContinuationByte(cursor));

      if (chars < args_->minn) {
        continue;
      }
      // Skip a single character sitting at either end of the word.
      const bool loneBoundaryChar =
          (chars == 1) && (start == 0 || cursor == total);
      if (loneBoundaryChar) {
        continue;
      }

      const int32_t bucketId = hash(piece) % args_->bucket;
      pushHash(ngrams, bucketId);
      if (substrings != nullptr) {
        substrings->push_back(piece);
      }
    }
  }
}

至此，依据输入数据构建词典的流程已经完成。主要是完成了word的去重、词频统计、词频排序、基于n-gram的sub-word预处理、word2id等处理。

【五】train流程分析

1. train的主流程

void FastText::train(const Args args) {
  args_ = std::make_shared<Args>(args);
  dict_ = std::make_shared<Dictionary>(args_);
  if (args_->input == "-") {
    // manage expectations
    throw std::invalid_argument("Cannot use stdin for training!");
  }
  std::ifstream ifs(args_->input);
  if (!ifs.is_open()) {
    throw std::invalid_argument(
        args_->input + " cannot be opened for training!");
  }
  // 1. 词典构造
  dict_->readFromFile(ifs);
  ifs.close();

  // 2. 如果有与训练的向量，则加载
  if (args_->pretrainedVectors.size() != 0) {
    loadVectors(args_->pretrainedVectors);
  } else {
    // 3. 构造输入数据矩阵的大小，这里也就是embidding的大小
    //    V*m
    input_ =
        std::make_shared<Matrix>(dict_->nwords() + args_->bucket, args_->dim);
    // 初始化词嵌入矩阵
    input_->uniform(1.0 / args_->dim);
  }

  if (args_->model == model_name::sup) {
    // 隐层输出矩阵大小，分类： n*m，词向量 V*m
    output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
  } else {
    output_ = std::make_shared<Matrix>(dict_->nwords(), args_->dim);
  }
  output_->zero();
  // 启动计算
  startThreads();
  model_ = std::make_shared<Model>(input_, output_, args_, 0);
  if (args_->model == model_name::sup) {
    model_->setTargetCounts(dict_->getCounts(entry_type::label));
  } else {
    model_->setTargetCounts(dict_->getCounts(entry_type::word));
  }
}

2. 单线程训练流程

void FastText::trainThread(int32_t threadId) {
  std::ifstream ifs(args_->input);
  // 1. 按照线程数，将输入数据平均分配给各个线程，
  //    各个线程之间不存在数据竞争，英雌不需要加锁
  utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);
  
  // 2. 初始化一个model
  Model model(input_, output_, args_, threadId);

  // 3. setTargetCounts 接口内部会完成tree或者负采样的数据初始化
  if (args_->model == model_name::sup) {
    model.setTargetCounts(dict_->getCounts(entry_type::label));
  } else {
    model.setTargetCounts(dict_->getCounts(entry_type::word));
  }

  const int64_t ntokens = dict_->ntokens();
  int64_t localTokenCount = 0;
  std::vector<int32_t> line, labels;
  while (tokenCount_ < args_->epoch * ntokens) {
    // 计算处理进度，动态调整学习率
    real progress = real(tokenCount_) / (args_->epoch * ntokens);
    real lr = args_->lr * (1.0 - progress);
    // 每次读取一行数据，依据模型不同，调用不同接口处理
    if (args_->model == model_name::sup) {
      // 文本分类
      localTokenCount += dict_->getLine(ifs, line, labels);
      supervised(model, lr, line, labels);
    } else if (args_->model == model_name::cbow) {
      // cbow
      localTokenCount += dict_->getLine(ifs, line, model.rng);
      cbow(model, lr, line);
    } else if (args_->model == model_name::sg) {
      // sg
      localTokenCount += dict_->getLine(ifs, line, model.rng);
      skipgram(model, lr, line);
    }
    if (localTokenCount > args_->lrUpdateRate) {
      tokenCount_ += localTokenCount;
      localTokenCount = 0;
      if (threadId == 0 && args_->verbose > 1)
        loss_ = model.getLoss();
    }
  }
  if (threadId == 0)
    loss_ = model.getLoss();
  ifs.close();
}

3. 层次softmax的tree的构造

// Builds the binary tree used by hierarchical softmax and records, for every
// output id, its path and binary code.
//
// `counts` holds one frequency per output id (indices 0 .. osz_ - 1). The
// merge below always consumes leaves from the highest index downwards, so it
// relies on `counts` being in non-increasing order.
// NOTE(review): that ordering is assumed to come from the dictionary sort —
// confirm at the call site.
//
// The tree has 2 * osz_ - 1 nodes: leaves first, internal nodes after them.
// For each leaf, paths gets the internal-node indices (offset by -osz_) and
// codes gets the left/right bits, both collected while walking from the leaf
// up to the root.
void Model::buildTree(const std::vector<int64_t>& counts) {
  const auto nodeCount = 2 * osz_ - 1;
  tree.resize(nodeCount);

  // Reset every node; the large count keeps unbuilt internal nodes from being
  // selected before they are filled in.
  for (auto& n : tree) {
    n.parent = -1;
    n.left = -1;
    n.right = -1;
    n.count = 1e15;
    n.binary = false;
  }
  for (int32_t id = 0; id < osz_; ++id) {
    tree[id].count = counts[id];
  }

  // Two cursors: `leaf` walks the leaves from the last one backwards, `node`
  // walks the internal nodes in creation order.
  int32_t leaf = osz_ - 1;
  int32_t node = osz_;
  const auto takeSmallest = [&]() -> int32_t {
    if (leaf >= 0 && tree[leaf].count < tree[node].count) {
      return leaf--;
    }
    return node++;
  };

  for (int32_t parent = osz_; parent < nodeCount; ++parent) {
    const int32_t first = takeSmallest();
    const int32_t second = takeSmallest();
    tree[parent].left = first;
    tree[parent].right = second;
    tree[parent].count = tree[first].count + tree[second].count;
    tree[first].parent = parent;
    tree[second].parent = parent;
    // The right child carries code bit `true`.
    tree[second].binary = true;
  }

  // Collect path and code for each leaf; loss computation uses them later.
  for (int32_t id = 0; id < osz_; ++id) {
    std::vector<int32_t> path;
    std::vector<bool> code;
    for (int32_t cur = id; tree[cur].parent != -1; cur = tree[cur].parent) {
      path.push_back(tree[cur].parent - osz_);
      code.push_back(tree[cur].binary);
    }
    paths.push_back(path);
    codes.push_back(code);
  }
}

4. 负采样

void Model::initTableNegatives(const std::vector<int64_t>& counts) {
  real z = 0.0;
  for (size_t i = 0; i < counts.size(); i++) {
    z += pow(counts[i], 0.5);
  }
  for (size_t i = 0; i < counts.size(); i++) {
    real c = pow(counts[i], 0.5);
    for (size_t j = 0; j < c * NEGATIVE_TABLE_SIZE / z; j++) {
      negatives_.push_back(i);
    }
  }
  std::shuffle(negatives_.begin(), negatives_.end(), rng);
}

5. 参数更新

// Performs one training step for a single (input, target) example.
//
//   input  - row ids of the input matrix that make up this example
//   target - id of the expected output; must satisfy 0 <= target < osz_
//   lr     - learning rate for this step
//
// Does nothing when `input` is empty. Otherwise it computes the hidden
// representation, adds the loss chosen by args_->loss to loss_, counts the
// example, and adds grad_ to every input row in wi_.
void Model::update(const std::vector<int32_t>& input, int32_t target, real lr) {
  assert(target >= 0);
  assert(target < osz_);
  if (input.empty()) {
    return;
  }

  // Fill hidden_ from the input rows.
  computeHidden(input, hidden_);

  // Dispatch on the configured loss; plain softmax is the fallback.
  switch (args_->loss) {
    case loss_name::ns:
      loss_ += negativeSampling(target, lr);
      break;
    case loss_name::hs:
      loss_ += hierarchicalSoftmax(target, lr);
      break;
    default:
      loss_ += softmax(target, lr);
      break;
  }
  nexamples_ += 1;

  // Supervised mode scales the gradient by 1 / (number of inputs).
  if (args_->model == model_name::sup) {
    grad_.mul(1.0 / input.size());
  }
  // Apply the gradient to each contributing input row.
  for (const int32_t row : input) {
    wi_->addRow(grad_, row, 1.0);
  }
}

具体计算的代码这里就不分析了。

【六】总结

其余部分的代码（如：预测、评估等），这里就不分析了，顺着代码看就可以了。fasttext的代码结构还是比较简单的。代码阅读的难点在于算法的理解。后续再结合算法，对代码细节做分析。

fasttext是一个很好的工具，但要训练出一个合适的模型，需要对模型的参数有所理解，然而一般情况下，默认的参数就能满足要求了。

转载于:https://my.oschina.net/u/3800567/blog/2877570