平台信息:
PC:ubuntu18.04、i5、anaconda2、cuda9.0、cudnn7.0.5、tensorflow1.10、GTX1060
一、将copy_train.csv文件的内容进行分词,生成process_copy_train.csv文件
"""Segment every line of copy_train.csv with jieba.

Writes one space-separated line of tokens per input line to
process_copy_train.csv, the file the word2vec training step reads.
"""
import io
import os
import re
import sys

import gensim
import jieba

SOURCE_PATH = u'copy_train.csv'
# Must match the file name consumed by the training script.
TARGET_PATH = u'process_copy_train.csv'

# The input is read as raw bytes and handed to jieba, which decodes it
# itself (jieba's README documents UTF-8 and GBK byte strings as accepted
# input).  The output is always written as UTF-8 text.
# Opening the target once in 'w' mode means a re-run replaces the file
# instead of appending a second copy of the corpus to it.
with io.open(SOURCE_PATH, 'rb') as source, \
        io.open(TARGET_PATH, 'w', encoding='utf-8') as target:
    for raw_line in source:
        # Drop the line terminator first; otherwise jieba returns it as a
        # token and every record is followed by an empty line.
        text = raw_line.rstrip(b'\r\n')
        # Whitespace-only tokens carry no vocabulary information and would
        # only produce runs of spaces in the output.
        tokens = [tok for tok in jieba.cut(text, cut_all=False) if tok.strip()]
        target.write(u' '.join(tokens))
        target.write(u'\n')
二、训练词汇表,并进行测试
# -*- coding: utf-8 -*-
"""Train a 50-dimensional word2vec model and run a few sanity queries.

Reads the segmented corpus process_copy_train.csv, prints three
similarity scores and the nearest neighbours of one word, then saves the
model to /tmp/word2vec_model and loads it back.
"""
from __future__ import print_function

import os
import re
import sys

import gensim
import jieba
from gensim.models import word2vec

if sys.version_info[0] == 2:
    # Python 2 only: make UTF-8 the implicit str<->unicode codec so that
    # printing unicode words also works when stdout is not a UTF-8 tty.
    reload(sys)  # noqa: F821 - builtin on Python 2
    sys.setdefaultencoding('utf8')

CORPUS_PATH = u'process_copy_train.csv'
MODEL_PATH = '/tmp/word2vec_model'

# NOTE(review): Text8Corpus splits the file on whitespace into fixed-size
# chunks; if one sentence per input line is wanted, word2vec.LineSentence
# may be the better reader - confirm before switching, results will differ.
sentences = word2vec.Text8Corpus(CORPUS_PATH)
# NOTE(review): `size` is the gensim 3.x keyword; gensim 4 renamed it to
# `vector_size` - confirm against the installed version.
model = word2vec.Word2Vec(sentences, size=50)

# Word-level queries go through the KeyedVectors object; calling them on
# the model itself is deprecated.
vectors = model.wv

# Vector lookup; raises KeyError right away if the word is not in the
# trained vocabulary.
vectors[u'美元']

for other in (u'美国', u'英镑', u'美元'):
    y2 = vectors.similarity(u'美元', other)
    print(y2)

# Query the neighbours once and reuse the result for both printouts.
str4 = vectors.most_similar(u'银行')
for word, score in str4:
    print(word, score)

print(str4)

model.save(MODEL_PATH)

# Round-trip check: the saved model can be loaded again.
new_model = gensim.models.Word2Vec.load(MODEL_PATH)
测试结果:
0.21382438
0.65352416
1.0
商业银行 0.724080383778
券商 0.67235070467
同业 0.65898835659
银行业 0.640146613121
金融机构 0.628186702728
中资银行 0.624082624912
流动性 0.589600920677
中小银行 0.587715625763
行 0.576077103615
信贷 0.575850129128