重要参考
https://github.com/liuhuanyong/ChineseEmbedding
原文作者提供了字向量、拼音向量、词向量、词性向量与依存关系向量,共5种类型的向量训练,
这里只取其中的字向量训练部分,略加修改后记录如下。
完整代码
train_vector.py
在此,设置字向量维度为100(注意:输出文件仍沿用 token_vec_300.bin 这一文件名,且以文本格式保存,并非300维的二进制文件)。
#!/usr/bin/env python3
# coding: utf-8
# File: train_vector.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-10-26
import os
import gensim
from gensim.models import word2vec
from sklearn.decomposition import PCA
import numpy as np
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
class TrainVector:
    """Train character embeddings with gensim word2vec and save them to disk."""

    def __init__(self):
        # Directory of this script. os.path.dirname handles both '/' and '\\'
        # separators; splitting the path on '/' yields an empty string on
        # Windows, where abspath() returns backslash-separated paths.
        cur = os.path.dirname(os.path.abspath(__file__))
        # Training corpus file. Raw strings keep the backslashes literal
        # without relying on invalid escape sequences such as '\C' or '\y'.
        # NOTE(review): these are absolute Windows paths, so os.path.join
        # ignores `cur` on Windows; on other platforms they are appended to it.
        self.token_filepath = os.path.join(
            cur, r'F:\浏览器下载\ChineseEmbedding-master\yuliao.txt')
        # Output file for the trained vectors. The name says "300" and ".bin",
        # but the vectors are token_size-dimensional and written as text.
        self.token_embedding = os.path.join(
            cur, r'F:\浏览器下载\ChineseEmbedding-master\token_vec_300.bin')
        # Dimensionality of the character vectors.
        self.token_size = 100

    def train_vector(self, train_path, embedding_path, embedding_size):
        """Train a word2vec model on a corpus file and save its vectors.

        Args:
            train_path: Path of the tokenized training corpus.
            embedding_path: Path the vectors are written to, in word2vec
                text format (binary=False).
            embedding_size: Dimensionality of the trained vectors.
        """
        import inspect

        # Load the tokenized corpus.
        sentences = word2vec.Text8Corpus(train_path)
        # gensim 4.0 renamed the `size` keyword to `vector_size`; pick
        # whichever name the installed Word2Vec constructor accepts.
        ctor_params = inspect.signature(word2vec.Word2Vec.__init__).parameters
        size_kwarg = 'vector_size' if 'vector_size' in ctor_params else 'size'
        # `sg` is left at its default, which the gensim docs describe as CBOW
        # (the original comment said skip-gram). min_count=0 keeps every
        # token regardless of frequency.
        model = word2vec.Word2Vec(
            sentences, window=5, min_count=0, **{size_kwarg: embedding_size})
        model.wv.save_word2vec_format(embedding_path, binary=False)

    def train_main(self):
        """Train the Chinese character vectors using the configured paths and size."""
        self.train_vector(self.token_filepath, self.token_embedding, self.token_size)
if __name__ == '__main__':
    # Script entry point: build the trainer and run the full training job.
    TrainVector().train_main()
得到的字向量文件为 word2vec 文本格式(由 save_word2vec_format(..., binary=False) 写出):
test_vector.py
测试效果(主观测试,无精确指标)
#!/usr/bin/env python3
# coding: utf-8
# File: test_vector.py
# Author: lhy<lhy_in_blcu@126.com,https://huangyong.github.io>
# Date: 18-11-1
import os
import gensim
from gensim.models import word2vec
from sklearn.decomposition import PCA
import numpy as np
import logging
logging.basicConfig(format='%(asctime)s:%(levelname)s:%(message)s', level=logging.INFO)
class TrainVector:
    """Interactively query a trained character-vector file for similar tokens."""

    def __init__(self):
        # Directory of this script. os.path.dirname handles both '/' and '\\'
        # separators; splitting the path on '/' yields an empty string on
        # Windows, where abspath() returns backslash-separated paths.
        cur = os.path.dirname(os.path.abspath(__file__))
        # Vector file written by the training script. The raw string keeps the
        # backslashes literal without relying on invalid escape sequences.
        self.token_embedding = os.path.join(
            cur, r'F:\浏览器下载\ChineseEmbedding-master\token_vec_300.bin')

    def test_model(self, embedding_path):
        """Load vectors and print the most similar tokens for each entered token.

        Prompts repeatedly on stdin. Tokens missing from the vocabulary are
        reported instead of aborting the session, and the loop ends when
        stdin is closed (EOF).

        Args:
            embedding_path: Path of a vector file in word2vec text format.
        """
        model = gensim.models.KeyedVectors.load_word2vec_format(embedding_path, binary=False)
        while True:
            try:
                # Strip surrounding whitespace so a stray space does not turn
                # a known token into an unknown one.
                wd = input('enter an word to search:').strip()
            except EOFError:
                # stdin was closed; end the session quietly.
                break
            try:
                result = model.most_similar(wd)
            except KeyError:
                # most_similar raises KeyError for out-of-vocabulary tokens.
                print('%r is not in the vocabulary' % wd)
                continue
            for res in result:
                print(res)
if __name__ == '__main__':
    # Script entry point: start the interactive similarity query session.
    handler = TrainVector()
    # Raw string: identical runtime value to the original literal, but without
    # the invalid escape sequences ('\浏', '\C') that newer Python versions
    # warn about.
    handler.test_model(r'F:\浏览器下载\ChineseEmbedding-master\token_vec_300.bin')