Skip-gram和CBOW简单区别
Skip-gram:词预测上下文
CBOW:上下文预测词
- 输入一个词,预测下一个词
- Skip-gram:一个词预测多个词
- CBOW:多个词预测一个词
3. 欧氏距离
5. 余弦相似度
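TF-IDF
$$\mathrm{tfidf}(t,d)=\mathrm{tf}(t,d)\times\mathrm{idf}(t),\qquad \mathrm{idf}(t)=\log\frac{n_d}{\mathrm{df}(t)}$$
其中 $n_d$ 等于文档总数,$\mathrm{df}(t)$ 为包含词条 $t$ 的文档数(实际实现中常对分母加 1 做平滑)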
例子:
# 训练 word2vec 代码
# Notebook shell escapes: print a label, then the line count of the corpus file.
!echo '数据集行数:'
!wc -l 'bioCorpus_5000.txt'
!echo '======'
# Print a label, then the first 10 lines of the corpus file.
!echo '数据集前10行'
!head -10 'bioCorpus_5000.txt'
### 2.2. Word2vec 训练
import nltk
from gensim.models import word2vec
class MySentences(object):
    """Restartable iterable over the whitespace-split lines of a text file.

    Lines are read lazily, one at a time, so a large corpus never has to be
    loaded into memory as a whole. Every call to ``__iter__`` opens the file
    afresh, so the same object can be iterated more than once.
    """

    def __init__(self, fname):
        """Store the path of the corpus file; the file is not opened here.

        Args:
            fname: Path of the text file to read, one sentence per line.
        """
        self.fname = fname

    def __iter__(self):
        """Yield each line of the file as a list of whitespace-separated tokens.

        Yields:
            list[str]: The tokens of one line; an empty list for a blank line.
        """
        # ``with`` guarantees the handle is closed once iteration finishes
        # (or the generator is closed), instead of leaking one per pass.
        with open(self.fname, 'r') as corpus:
            for line in corpus:
                yield line.split()
# Model-training function
def w2vTrain(f_input, model_output):
    """Train a Word2Vec model on a corpus file and save it to disk.

    The hyper-parameters come from the module-level constants ``MIN_COUNT``,
    ``CPU_NUM``, ``VEC_SIZE`` and ``CONTEXT_WINDOW``.

    Args:
        f_input: Corpus file name; it is appended to ``DataDir``.
        model_output: Model file name; it is appended to ``ModelDir``.
    """
    # MySentences reads the corpus line by line rather than loading it whole.
    corpus = MySentences(DataDir + f_input)
    hyperparams = dict(
        min_count=MIN_COUNT,
        workers=CPU_NUM,
        size=VEC_SIZE,
        window=CONTEXT_WINDOW,
    )
    model = word2vec.Word2Vec(corpus, **hyperparams)
    model.save(ModelDir + model_output)
# Training run
# Corpus file and model file names, resolved against the directories below.
f_input = "bioCorpus_5000.txt"
model_output = "test_w2v_model"
DataDir = "./"
ModelDir = "./"

# Hyper-parameters read by the training functions.
VEC_SIZE = 20
CONTEXT_WINDOW = 5  # context reaches at most 5 words away from the target word
MIN_COUNT = 4
CPU_NUM = 2  # Cython must be installed beforehand for parallel training

w2vTrain(f_input, model_output)
### 2.3. 查看结果
# Load the model saved by the training step.
w2v_model = word2vec.Word2Vec.load(ModelDir + model_output)
# Query through the model's ``wv`` word vectors: calling most_similar directly
# on the model is deprecated in gensim 3.x and removed in gensim 4.0.
w2v_model.wv.most_similar('body')  # results are mediocre
w2v_model.wv.most_similar('heart')  # results are very poor
# When the dataset is not large enough, stop words are too prevalent.
# Remedy: remove the stop words before training.
from nltk.corpus import stopwords

# English stop-word list from NLTK.
StopWords = stopwords.words("english")
StopWords[0:20]  # peek at the first 20 entries
# Retraining: model-training function that drops stop words first
def w2vTrain_removeStopWords(f_input, model_output):
    """Train a Word2Vec model on a corpus with stop words removed, then save it.

    Unlike ``w2vTrain``, the filtered corpus is held in memory as a list of
    token lists before training.

    Args:
        f_input: Corpus file name; it is appended to ``DataDir``.
        model_output: Model file name; it is appended to ``ModelDir``.
    """
    # Build the set once: membership in a set is O(1), whereas testing against
    # the StopWords list rescans it for every token of the corpus.
    # NOTE(review): matching is case-sensitive; confirm the corpus is
    # lower-cased if capitalised stop words should be removed as well.
    stop_words = set(StopWords)
    sentences = [
        [w for w in sentence if w not in stop_words]
        for sentence in MySentences(DataDir + f_input)
    ]
    w2v_model = word2vec.Word2Vec(
        sentences,
        min_count=MIN_COUNT,
        workers=CPU_NUM,
        size=VEC_SIZE,
        # Same context window as w2vTrain, so the two runs differ only in
        # stop-word filtering.
        window=CONTEXT_WINDOW,
    )
    w2v_model.save(ModelDir + model_output)
# Retrain without stop words (saved under the same model file name as before),
# reload the model and query it again.
w2vTrain_removeStopWords(f_input, model_output)
w2v_model = word2vec.Word2Vec.load(ModelDir + model_output)
# Query through ``wv``: most_similar directly on the model is deprecated in
# gensim 3.x and removed in gensim 4.0.
w2v_model.wv.most_similar('heart')  # results are mediocre
结果:
[('formation', 0.9698415994644165),
('blood', 0.965557336807251),
('metabolism', 0.9630600214004517),
('changes', 0.9596229195594788),
('study', 0.9578454494476318),
('brain', 0.9577423334121704),
('liver', 0.957546591758728),
('synthesis', 0.9570505619049072),
('method', 0.9542899131774902),
('renal', 0.9542509913444519)]
TextRank
https://www.cnblogs.com/motohq/p/11887420.html