Let's now look at how to generate word vectors with fastText. Running word-vector-example.sh produces word vectors that take subwords into account. First, let's look at the script. It starts by downloading the corpus and the test set: after downloading the corpus, it unzips it and preprocesses it with wikifil.pl to obtain plain text.
if [ ! -f "${DATADIR}/fil9" ]
then
wget -c http://mattmahoney.net/dc/enwik9.zip -P "${DATADIR}"
unzip "${DATADIR}/enwik9.zip" -d "${DATADIR}"
perl wikifil.pl "${DATADIR}/enwik9" > "${DATADIR}"/fil9
fi
if [ ! -f "${DATADIR}/rw/rw.txt" ]
then
wget -c https://nlp.stanford.edu/~lmthang/morphoNLM/rw.zip -P "${DATADIR}"
unzip "${DATADIR}/rw.zip" -d "${DATADIR}"
fi
Next the script builds the source and trains in skipgram mode. Note the subword-related flags: -minn 3 -maxn 6 sets the character n-gram lengths, and -bucket 2000000 sets the size of the hash table that holds the n-gram vectors:
make
./fasttext skipgram -input "${DATADIR}"/fil9 -output "${RESULTDIR}"/fil9 -lr 0.025 -dim 100 \
  -ws 5 -epoch 1 -minCount 5 -neg 5 -loss ns -bucket 2000000 \
  -minn 3 -maxn 6 -thread 4 -t 1e-4 -lrUpdateRate 100
Finally, the script extracts the word pairs from the Rare Words (RW) test set, lower-cases them, computes their vectors with print-word-vectors, and evaluates the correlation with human similarity judgments using eval.py:
cut -f 1,2 "${DATADIR}"/rw/rw.txt | awk '{print tolower($0)}' | tr '\t' '\n' > "${DATADIR}"/queries.txt
cat "${DATADIR}"/queries.txt | ./fasttext print-word-vectors "${RESULTDIR}"/fil9.bin > "${RESULTDIR}"/vectors.txt
python eval.py -m "${RESULTDIR}"/vectors.txt -d "${DATADIR}"/rw/rw.txt
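It is worth knowing up front what print-word-vectors computes: the vector of a (possibly out-of-vocabulary) word is the average of the input-matrix rows of all its subwords, i.e. the word itself if it is in the vocabulary, plus its character n-grams. Below is a minimal sketch of that averaging, not the real FastText::getWordVector; the plain vector-of-vectors matrix and the wordVector name are simplifications for illustration.
#include <cstddef>
#include <cstdint>
#include <vector>

// Sketch only: average the input rows of a word's subwords. `input`
// stands in for fastText's input matrix (nwords + bucket rows), and
// `subwordIds` for the result of Dictionary::getSubwords.
std::vector<float> wordVector(const std::vector<std::vector<float>>& input,
                              const std::vector<int32_t>& subwordIds) {
  if (input.empty() || subwordIds.empty()) return {};
  std::vector<float> vec(input[0].size(), 0.0f);
  for (int32_t id : subwordIds) {
    for (std::size_t d = 0; d < vec.size(); d++) {
      vec[d] += input[id][d];
    }
  }
  for (float& v : vec) {
    v /= subwordIds.size();  // average over all subwords
  }
  return vec;
}
This sharing of n-gram rows is why the RW test set is a good benchmark here: even rare words get reasonable vectors through their character n-grams.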
Now let's walk through the C++ source to see how the word vectors are computed step by step. As with the supervised text-classification mode, main dispatches to the train function:
int main(int argc, char** argv) {
  if (argc < 2) {
    printUsage();
    exit(EXIT_FAILURE);
  }
  std::string command(argv[1]);
  if (command == "skipgram" || command == "cbow" || command == "supervised") {
    train(argc, argv);
  } else if (command == "test") {
    test(argc, argv);
  } else if (command == "quantize") {
    quantize(argc, argv);
  } else if (command == "print-word-vectors") {
    printWordVectors(argc, argv);
  } else if (command == "print-sentence-vectors") {
    printSentenceVectors(argc, argv);
  } else if (command == "print-ngrams") {
    printNgrams(argc, argv);
  } else if (command == "nn") {
    nn(argc, argv);
  } else if (command == "analogies") {
    analogies(argc, argv);
  } else if (command == "predict" || command == "predict-prob") {
    predict(argc, argv);
  } else {
    printUsage();
    exit(EXIT_FAILURE);
  }
  return 0;
}
void train(int argc, char** argv) {
  std::shared_ptr<Args> a = std::make_shared<Args>();
  a->parseArgs(argc, argv);
  FastText fasttext;
  fasttext.train(a);
}
Let's continue with fasttext.train. We introduced this function earlier: it builds the dictionary, initializes all parameters, trains with multiple threads, and saves the model. Most of this is the same as for text classification. One key difference is the shape of the output matrix: for text classification it is number-of-labels × dim, while here it is vocabulary-size (nwords) × dim, because classification predicts labels whereas word-vector training predicts context words from the vocabulary.
void FastText::train(std::shared_ptr<Args> args) {
  args_ = args;
  dict_ = std::make_shared<Dictionary>(args_);
  if (args_->input == "-") {
    // manage expectations
    std::cerr << "Cannot use stdin for training!" << std::endl;
    exit(EXIT_FAILURE);
  }
  std::ifstream ifs(args_->input);
  if (!ifs.is_open()) {
    std::cerr << "Input file cannot be opened!" << std::endl;
    exit(EXIT_FAILURE);
  }
  dict_->readFromFile(ifs);
  ifs.close();

  if (args_->pretrainedVectors.size() != 0) {
    loadVectors(args_->pretrainedVectors);
  } else {
    input_ = std::make_shared<Matrix>(dict_->nwords() + args_->bucket, args_->dim);
    input_->uniform(1.0 / args_->dim);
  }

  if (args_->model == model_name::sup) {
    output_ = std::make_shared<Matrix>(dict_->nlabels(), args_->dim);
  } else {
    output_ = std::make_shared<Matrix>(dict_->nwords(), args_->dim);
  }
  output_->zero();

  start = clock();
  tokenCount = 0;
  if (args_->thread > 1) {
    std::vector<std::thread> threads;
    for (int32_t i = 0; i < args_->thread; i++) {
      threads.push_back(std::thread([=]() { trainThread(i); }));
    }
    for (auto it = threads.begin(); it != threads.end(); ++it) {
      it->join();
    }
  } else {
    trainThread(0);
  }
  model_ = std::make_shared<Model>(input_, output_, args_, 0);

  saveModel();
  if (args_->model != model_name::sup) {
    saveVectors();
    if (args_->saveOutput > 0) {
      saveOutput();
    }
  }
}
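Note the size of the input matrix: dict_->nwords() + args_->bucket rows. The first nwords rows hold the vectors of vocabulary words; the remaining bucket rows (2,000,000 in our script) are shared by all character n-grams, which are mapped to rows by hashing. The sketch below illustrates that mapping; the FNV-1a constants match fastText's Dictionary::hash, but the helper names are made up for illustration.
#include <cstdint>
#include <string>

// FNV-1a hash, as used by fastText's Dictionary::hash.
uint32_t fnv1aHash(const std::string& str) {
  uint32_t h = 2166136261;
  for (char c : str) {
    h ^= static_cast<uint32_t>(static_cast<int8_t>(c));
    h *= 16777619;
  }
  return h;
}

// An n-gram such as "<wh" (words are padded with '<' and '>' before
// extracting substrings of length minn to maxn) lands in one of
// `bucket` shared rows placed after the nwords word rows.
int32_t subwordId(const std::string& ngram, int32_t nwords, int32_t bucket) {
  return nwords + static_cast<int32_t>(fnv1aHash(ngram) % bucket);
}
Hash collisions mean different n-grams can share a row, which is the usual trade-off for keeping the matrix a fixed size regardless of how many distinct n-grams the corpus contains.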
The difference here is simply that the skipgram function gets called. Everything else follows the same flow: initialize the model from the parameters, then loop: read a line, train on it, and periodically update the learning rate, which decays linearly with training progress.
void FastText::trainThread(int32_t threadId) {
  std::ifstream ifs(args_->input);
  // each thread starts reading from its own slice of the input file
  utils::seek(ifs, threadId * utils::size(ifs) / args_->thread);

  Model model(input_, output_, args_, threadId);
  if (args_->model == model_name::sup) {
    model.setTargetCounts(dict_->getCounts(entry_type::label));
  } else {
    model.setTargetCounts(dict_->getCounts(entry_type::word));
  }

  const int64_t ntokens = dict_->ntokens();
  int64_t localTokenCount = 0;
  std::vector<int32_t> line, labels;
  while (tokenCount < args_->epoch * ntokens) {
    real progress = real(tokenCount) / (args_->epoch * ntokens);
    real lr = args_->lr * (1.0 - progress);  // linear decay
    localTokenCount += dict_->getLine(ifs, line, labels, model.rng);
    if (args_->model == model_name::sup) {
      supervised(model, lr, line, labels);
    } else if (args_->model == model_name::cbow) {
      cbow(model, lr, line);
    } else if (args_->model == model_name::sg) {
      skipgram(model, lr, line);
    }
    if (localTokenCount > args_->lrUpdateRate) {
      tokenCount += localTokenCount;
      localTokenCount = 0;
      if (threadId == 0 && args_->verbose > 1) {
        printInfo(progress, model.getLoss());
      }
    }
  }
  if (threadId == 0 && args_->verbose > 0) {
    printInfo(1.0, model.getLoss());
    std::cerr << std::endl;
  }
  ifs.close();
}
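For the word models, setTargetCounts receives the per-word counts and, with -loss ns, uses them to build the table from which negative samples are drawn. The sketch below illustrates the idea, assuming the count^0.5 weighting fastText uses in Model::initTableNegatives; the function name and default table size are illustrative.
#include <cmath>
#include <cstdint>
#include <vector>

// Simplified sketch of the negative-sampling table: each word gets a
// number of slots proportional to count^0.5, so frequent words are
// drawn more often, but only sub-linearly in their frequency.
std::vector<int32_t> buildNegativeTable(const std::vector<int64_t>& counts,
                                        std::size_t tableSize = 10000000) {
  double z = 0.0;
  for (int64_t c : counts) z += std::pow(c, 0.5);
  std::vector<int32_t> table;
  table.reserve(tableSize);
  for (std::size_t i = 0; i < counts.size(); i++) {
    double slots = std::pow(counts[i], 0.5) * tableSize / z;
    for (std::size_t j = 0; j < slots; j++) {
      table.push_back(static_cast<int32_t>(i));
    }
  }
  return table;  // a negative sample is a uniform draw from this table
}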
Here is the key difference from text classification: the loops. The outer loop runs over every word of the line; the inner loop runs over the context words inside a window around it. Note that the window half-size, boundary, is drawn uniformly from 1 to ws for each center word, so nearer context words are used more often in expectation. A word is represented by a vector of ids because a word consists of multiple subwords (the word itself plus its character n-grams), and it is this bag of subword ids that gets passed to model.update.
void FastText::skipgram(Model& model, real lr,
                        const std::vector<int32_t>& line) {
  std::uniform_int_distribution<> uniform(1, args_->ws);
  for (int32_t w = 0; w < line.size(); w++) {
    // dynamic window: the half-size is sampled from [1, ws] per word
    int32_t boundary = uniform(model.rng);
    const std::vector<int32_t>& ngrams = dict_->getSubwords(line[w]);
    for (int32_t c = -boundary; c <= boundary; c++) {
      if (c != 0 && w + c >= 0 && w + c < line.size()) {
        // predict the context word from the center word's subwords
        model.update(ngrams, line[w + c], lr);
      }
    }
  }
}
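The walkthrough ends at model.update(ngrams, line[w + c], lr), so it helps to see what a single update does. The following is a heavily simplified sketch of one skipgram step with negative sampling, not the real Model::update: the hidden vector is the average of the center word's subword rows, a binary logistic loss is evaluated on the true context word plus a few pre-drawn negatives, and the accumulated gradient is added to every subword row.
#include <cmath>
#include <cstdint>
#include <utility>
#include <vector>

using Vec = std::vector<float>;
using Mat = std::vector<Vec>;

static float sigmoid(float x) { return 1.0f / (1.0f + std::exp(-x)); }

static float dot(const Vec& a, const Vec& b) {
  float s = 0.0f;
  for (std::size_t i = 0; i < a.size(); i++) s += a[i] * b[i];
  return s;
}

// One simplified skipgram update with negative sampling.
// `ngrams`: subword rows of the center word; `target`: the context
// word (label 1); `negatives`: pre-drawn negative word ids (label 0).
void updateSketch(Mat& input, Mat& output, const std::vector<int32_t>& ngrams,
                  int32_t target, const std::vector<int32_t>& negatives,
                  float lr) {
  std::size_t dim = input[0].size();
  // hidden = average of the center word's subword vectors
  Vec hidden(dim, 0.0f), grad(dim, 0.0f);
  for (int32_t id : ngrams)
    for (std::size_t d = 0; d < dim; d++) hidden[d] += input[id][d];
  for (float& h : hidden) h /= ngrams.size();
  // one positive example plus several negatives
  std::vector<std::pair<int32_t, float>> samples = {{target, 1.0f}};
  for (int32_t n : negatives) samples.push_back({n, 0.0f});
  for (auto& [word, label] : samples) {
    float score = sigmoid(dot(output[word], hidden));
    float alpha = lr * (label - score);  // gradient of the log loss
    for (std::size_t d = 0; d < dim; d++) {
      grad[d] += alpha * output[word][d];
      output[word][d] += alpha * hidden[d];
    }
  }
  // every subword of the center word receives the same gradient
  for (int32_t id : ngrams)
    for (std::size_t d = 0; d < dim; d++) input[id][d] += grad[d];
}
Because the same gradient is written into each subword row, the character n-grams are trained alongside the full words, which is what lets fastText later assemble sensible vectors for rare or unseen words.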