安装
在github上找到源码下载编译即可
$ git clone http://github.com/stanfordnlp/glove
$ cd glove && make
$ ./demo.sh
执行顺序
GloVe代码包括了四个.c文件
- 首先执行vocab_count.c。这个文件的功能是扫一遍语料,建立一个字典(词表)。
- 执行cooccur.c文件。它的功能是从语料中建立共现矩阵,GloVe是在共现矩阵上面进行训练的。所谓共现矩阵,在GloVe中是大量的三元组<eat, food, 150>,<of, the, 100000>…cooccur.c文件生成的三元组的顺序是根据词频排好序的。比如<of, the, 100000>在<eat, food, 150>前面,因为of的频数大于eat。<of, the, 100000>在<of, mine, 1000>前面,因为the的频数大于mine。
- 执行第三个文件shuffle.c。这个文件是用来打乱之前生成的共现矩阵(也就是打乱三元组的顺序)。
- 最后执行glove.c。它会在打乱顺序的三元组上面训练词向量。
评价指标
eval目录中给出了评估词向量的代码。评估的方法和word2vec是一模一样的,数据集也是一模一样的,只不过GloVe的评估代码默认是用Python写的(脚本中也可以选择matlab或octave版本)。要运行GloVe,执行demo.sh脚本即可。下面是demo.sh的代码,从中也可以看到我们上面说的GloVe执行的流程。
下面的脚本会自动下载一个测试用的语料库(text8);如果使用自己的语料库,可以去掉下载这一部分。
#!/bin/bash
# Abort on the first failing command so that a broken build or download does
# not cascade into the later steps.
set -e

# Makes programs, downloads sample data, trains a GloVe model, and then evaluates it.
# One optional argument can specify the language used for eval script: matlab, octave or [default] python

make

# Fetch the text8 sample corpus unless a file named text8 already exists.
# Comment out this whole download section when using your own corpus.
if [ ! -e text8 ]; then
  if command -v wget >/dev/null 2>&1; then
    wget http://mattmahoney.net/dc/text8.zip
  else
    # -f: exit non-zero on an HTTP error instead of saving the error page as
    #     text8.zip (which would only show up later as an unzip failure).
    # -L: follow redirects, which wget already does by default.
    curl -fL -O http://mattmahoney.net/dc/text8.zip
  fi
  unzip text8.zip
  rm text8.zip
fi
# End of the sample-data download section.
# --- Files and directories -------------------------------------------------
# Input corpus; replace with the path to your own corpus.
CORPUS='text8'
# Directory containing the compiled binaries (invoked as $BUILDDIR/<tool>).
BUILDDIR='build'
# Written by vocab_count; passed as -vocab-file to cooccur and glove.
VOCAB_FILE='vocab.txt'
# Written by cooccur; read by shuffle.
COOCCURRENCE_FILE='cooccurrence.bin'
# Written by shuffle; passed as -input-file to glove.
COOCCURRENCE_SHUF_FILE='cooccurrence.shuf.bin'
# Passed as -save-file to glove.
SAVE_FILE='vectors'

# --- Options shared by several tools ---------------------------------------
# Verbosity level passed as -verbose; set to 2 to see the progress of the run.
VERBOSE='2'
# Passed as -memory to cooccur and shuffle.
MEMORY='4.0'

# --- Per-tool options ------------------------------------------------------
# vocab_count: -min-count
VOCAB_MIN_COUNT='5'
# cooccur: -window-size
WINDOW_SIZE='15'
# glove: -vector-size, -iter, -x-max, -binary, -threads
VECTOR_SIZE='50'
MAX_ITER='15'
X_MAX='10'
BINARY='2'
NUM_THREADS='8'
# Run the four tools in order: vocab_count -> cooccur -> shuffle -> glove.
# Each command line is echoed before it runs so the log shows what was executed.
# All expansions are quoted so that paths containing spaces or glob characters
# (e.g. a user-supplied CORPUS) are passed through as single words.
echo

# Step 1: read the corpus on stdin and write the vocabulary file.
echo "$ $BUILDDIR/vocab_count -min-count $VOCAB_MIN_COUNT -verbose $VERBOSE < $CORPUS > $VOCAB_FILE"
"$BUILDDIR"/vocab_count -min-count "$VOCAB_MIN_COUNT" -verbose "$VERBOSE" \
  < "$CORPUS" > "$VOCAB_FILE"

# Step 2: read the corpus again, using the vocabulary, and write the
# co-occurrence file.
echo "$ $BUILDDIR/cooccur -memory $MEMORY -vocab-file $VOCAB_FILE -verbose $VERBOSE -window-size $WINDOW_SIZE < $CORPUS > $COOCCURRENCE_FILE"
"$BUILDDIR"/cooccur -memory "$MEMORY" -vocab-file "$VOCAB_FILE" \
  -verbose "$VERBOSE" -window-size "$WINDOW_SIZE" \
  < "$CORPUS" > "$COOCCURRENCE_FILE"

# Step 3: shuffle the co-occurrence file into a new file.
echo "$ $BUILDDIR/shuffle -memory $MEMORY -verbose $VERBOSE < $COOCCURRENCE_FILE > $COOCCURRENCE_SHUF_FILE"
"$BUILDDIR"/shuffle -memory "$MEMORY" -verbose "$VERBOSE" \
  < "$COOCCURRENCE_FILE" > "$COOCCURRENCE_SHUF_FILE"

# Step 4: run glove on the shuffled co-occurrence file.
echo "$ $BUILDDIR/glove -save-file $SAVE_FILE -threads $NUM_THREADS -input-file $COOCCURRENCE_SHUF_FILE -x-max $X_MAX -iter $MAX_ITER -vector-size $VECTOR_SIZE -binary $BINARY -vocab-file $VOCAB_FILE -verbose $VERBOSE"
"$BUILDDIR"/glove -save-file "$SAVE_FILE" -threads "$NUM_THREADS" \
  -input-file "$COOCCURRENCE_SHUF_FILE" -x-max "$X_MAX" -iter "$MAX_ITER" \
  -vector-size "$VECTOR_SIZE" -binary "$BINARY" -vocab-file "$VOCAB_FILE" \
  -verbose "$VERBOSE"
# Evaluation runs only when CORPUS is the text8 sample corpus.
# The first script argument picks the evaluator: 'matlab' or 'octave';
# any other value (including no argument) falls through to python.
if [[ "$CORPUS" == 'text8' ]]; then
  case "$1" in
    matlab)
      # The evaluator's stdout is redirected to stderr.
      matlab -nodisplay -nodesktop -nojvm -nosplash < ./eval/matlab/read_and_evaluate.m 1>&2
      ;;
    octave)
      octave < ./eval/octave/read_and_evaluate_octave.m 1>&2
      ;;
    *)
      echo "$ python eval/python/evaluate.py"
      python eval/python/evaluate.py
      ;;
  esac
fi