词向量源码解析:(4.8)hyperwords源码解析之example_test

下面来看 example_test.sh 脚本。该脚本可以生成多种词向量,包括在不同超参数设置下的 word2vec(SGNS)、PPMI 和 SVD 词向量。整体流程是:先预处理语料,然后训练模型或直接计算出词向量,最后用 analogy 和 similarity 两个任务进行评估。

#!/bin/sh


# Download and install word2vecf unless its binary is already available.
# The check targets word2vecf/word2vecf because that is the path the SGNS
# training commands in this script execute. `word2vecf` itself is therefore a
# directory, so a `-f word2vecf` (regular file) test can never succeed and
# would re-run the installer on every invocation.
# NOTE(review): assumes install_word2vecf.sh leaves the binary at
# word2vecf/word2vecf - confirm against that script.
if [ ! -x word2vecf/word2vecf ]; then
    scripts/install_word2vecf.sh
fi




# Download corpus. We chose a small corpus for the example, and larger corpora
# will yield better results.
# The download and decompression are skipped when the uncompressed corpus is
# already present, so re-running the script does not fetch the archive again
# or make gzip stumble over an existing output file.
CORPUS=news.2010.en.shuffled
if [ ! -f "${CORPUS}" ]; then
    wget "http://www.statmt.org/wmt14/training-monolingual-news-crawl/${CORPUS}.gz"
    gzip -d "${CORPUS}.gz"
fi


# Clean the corpus from non alpha-numeric symbols.
scripts/clean_corpus.sh "${CORPUS}" > "${CORPUS}.clean"




# Create two example collections of word-context pairs:


# A) Window size 2 with "clean" subsampling (hyperparameter choice A).
# Explanatory notes are kept on their own lines: in sh a `#` only starts a
# comment at the beginning of a word, so a note glued to the last argument
# becomes part of that file name.
# -p keeps a re-run from failing on the already existing directory.
mkdir -p w2.sub
# Corpus -> word-context pairs.
python hyperwords/corpus2pairs.py --win 2 --sub 1e-5 "${CORPUS}.clean" > w2.sub/pairs
# Pairs -> co-occurrence counts.
scripts/pairs2counts.sh w2.sub/pairs > w2.sub/counts
# Counts -> word and context vocabularies.
python hyperwords/counts2vocab.py w2.sub/counts


# B) Window size 5 with dynamic contexts and "dirty" subsampling
#    (hyperparameter choice B).
mkdir w5.dyn.sub.del
# Corpus -> pairs, pairs -> counts, counts -> vocabularies.
python hyperwords/corpus2pairs.py \
    --win 5 --dyn --sub 1e-5 --del \
    "${CORPUS}.clean" \
    > w5.dyn.sub.del/pairs
scripts/pairs2counts.sh w5.dyn.sub.del/pairs \
    > w5.dyn.sub.del/counts
python hyperwords/counts2vocab.py w5.dyn.sub.del/counts


# Calculate PMI matrices for each collection of pairs (counts -> PMI matrix).
# The note stays on its own line: a `#` glued to the output path is not a
# comment in sh and would become part of the path handed to counts2pmi.py.
python hyperwords/counts2pmi.py --cds 0.75 w2.sub/counts w2.sub/pmi
python hyperwords/counts2pmi.py --cds 0.75 w5.dyn.sub.del/counts w5.dyn.sub.del/pmi




# Create embeddings with SVD (PMI matrix -> SVD).
# Notes are kept on separate lines so that no `#...` text is glued onto an
# output path or a cp destination, where sh would treat it as part of the name.
python hyperwords/pmi2svd.py --dim 500 --neg 5 w2.sub/pmi w2.sub/svd
# PMI and SVD share the word and context vocabularies, so copy them under the
# svd prefix.
cp w2.sub/pmi.words.vocab w2.sub/svd.words.vocab
cp w2.sub/pmi.contexts.vocab w2.sub/svd.contexts.vocab
python hyperwords/pmi2svd.py --dim 500 --neg 5 w5.dyn.sub.del/pmi w5.dyn.sub.del/svd
cp w5.dyn.sub.del/pmi.words.vocab w5.dyn.sub.del/svd.words.vocab
cp w5.dyn.sub.del/pmi.contexts.vocab w5.dyn.sub.del/svd.contexts.vocab




# Create embeddings with SGNS (A): train word2vecf on the pairs of choice A.
# Commands 2-5 are necessary for loading the vectors with embeddings.py.
word2vecf/word2vecf -train w2.sub/pairs -pow 0.75 -cvocab w2.sub/counts.contexts.vocab -wvocab w2.sub/counts.words.vocab -dumpcv w2.sub/sgns.contexts -output w2.sub/sgns.words -threads 10 -negative 15 -size 500
# Convert the text-format vectors to numpy arrays. The text file is removed
# only when the conversion succeeded, so a failed conversion does not throw
# away the trained vectors. The note is on its own line because a `#` glued to
# the path would be passed to text2numpy.py as part of the file name.
python hyperwords/text2numpy.py w2.sub/sgns.words && rm w2.sub/sgns.words
python hyperwords/text2numpy.py w2.sub/sgns.contexts && rm w2.sub/sgns.contexts


# Create embeddings with SGNS (B): train word2vecf on the pairs of choice B.
# Commands 2-5 are necessary for loading the vectors with embeddings.py.
word2vecf/word2vecf -train w5.dyn.sub.del/pairs -pow 0.75 -cvocab w5.dyn.sub.del/counts.contexts.vocab -wvocab w5.dyn.sub.del/counts.words.vocab -dumpcv w5.dyn.sub.del/sgns.contexts -output w5.dyn.sub.del/sgns.words -threads 10 -negative 15 -size 500
# Remove each text-format file only after text2numpy.py succeeded on it, so a
# failed conversion does not delete the trained vectors.
python hyperwords/text2numpy.py w5.dyn.sub.del/sgns.words && rm w5.dyn.sub.del/sgns.words
python hyperwords/text2numpy.py w5.dyn.sub.del/sgns.contexts && rm w5.dyn.sub.del/sgns.contexts




# Evaluation, part 1: word similarity on WS353.
printf '\n'
printf '%s\n' "WS353 Results" "-------------"

# Run the three representations (PPMI, SVD, SGNS) for collection A, then for
# collection B, in that order.
for collection in w2.sub w5.dyn.sub.del; do
    python hyperwords/ws_eval.py --neg 5 PPMI "${collection}/pmi" testsets/ws/ws353.txt
    python hyperwords/ws_eval.py --eig 0.5 SVD "${collection}/svd" testsets/ws/ws353.txt
    python hyperwords/ws_eval.py --w+c SGNS "${collection}/sgns" testsets/ws/ws353.txt
done




# Evaluation, part 2: Google analogy test set.
printf '\n'
printf '%s\n' "Google Analogy Results" "----------------------"

# Run the three representations (PPMI, SVD, SGNS) for collection A, then for
# collection B, in that order.
for collection in w2.sub w5.dyn.sub.del; do
    python hyperwords/analogy_eval.py PPMI "${collection}/pmi" testsets/analogy/google.txt
    python hyperwords/analogy_eval.py --eig 0 SVD "${collection}/svd" testsets/analogy/google.txt
    python hyperwords/analogy_eval.py SGNS "${collection}/sgns" testsets/analogy/google.txt
done

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值