-
模型加载
import gensim.models
import pandas
#vocabulary 中不存在的单词计算,切分字符求平均计算余弦相似度
# 本方法,使用wiki word+char模型,计算PKU 500词语相似,在wiki中的词汇
def cha_w2v(s):
    """Return the mean of the ``model`` lookups for every item of ``s``.

    Each item is looked up in the module-level ``model`` and echoed to
    stdout together with its value; the result is ``sum(...) / len(...)``
    over those values.

    NOTE(review): ``model`` is not defined in this snippet; it is presumably
    a loaded gensim model that supports ``model[key]`` -- confirm.
    NOTE(review): an empty ``s`` ends in ZeroDivisionError.
    """
    vectors = []
    for piece in s:
        vector = model[piece]
        print(piece, vector)
        vectors.append(vector)
    return sum(vectors) / len(vectors)
def Cosine(vec1, vec2):
    """Return the cosine similarity of two equal-length numeric vectors.

    Args:
        vec1: Sequence or array of numbers.
        vec2: Sequence or array of numbers with the same length as ``vec1``.

    Returns:
        The dot product of the vectors divided by the product of their
        Euclidean norms. If either vector has zero norm the quotient is
        undefined and NumPy yields ``nan`` (with a RuntimeWarning).
    """
    # Imported here because the file's top-level imports do not provide NumPy.
    import numpy as np

    # Convert to float so squaring large integer components cannot overflow.
    npvec1 = np.asarray(vec1, dtype=float)
    npvec2 = np.asarray(vec2, dtype=float)
    norm_product = np.linalg.norm(npvec1) * np.linalg.norm(npvec2)
    return npvec1.dot(npvec2) / norm_product
# Loaded vector files keyed by path, so each file is parsed only once.
_VECTOR_CACHE = {}


def _load_vectors(path):
    """Load word2vec-format text vectors from ``path``, reusing earlier loads."""
    import gensim

    if path not in _VECTOR_CACHE:
        _VECTOR_CACHE[path] = gensim.models.KeyedVectors.load_word2vec_format(
            path, binary=False
        )
    return _VECTOR_CACHE[path]


def word2vec(word1, word2):
    """Return the similarity of two words under the ``sgns.target.word`` vectors.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The value of ``KeyedVectors.similarity(word1, word2)``, or ``None``
        when a word is missing from the vocabulary (the KeyError message is
        printed in that case).
    """
    model = _load_vectors(r'D:\paper\github\baike\sgns.target.word')
    try:
        # Call similarity on the KeyedVectors object itself: its ``.wv``
        # alias is deprecated in gensim 3.x and removed in gensim 4.0.
        return model.similarity(word1, word2)
    except KeyError as e:
        # Out-of-vocabulary word: report it and signal "no score" with None.
        print(e)
        return None
def open_file(path, path1):
    """Print the ``word2vec`` similarity for each line pair of two text files.

    Line ``k`` of ``path`` is paired with line ``k`` of ``path1``; newline
    characters are removed from both before they are handed to ``word2vec``.
    Iteration follows the lines of ``path``, so a shorter ``path1`` ends in
    an IndexError.

    Args:
        path: File whose lines supply the first word of each pair.
        path1: File whose lines supply the second word of each pair.
    """
    with open(path, 'r') as left_file:
        left_lines = left_file.readlines()
    with open(path1, "r") as right_file:
        right_lines = right_file.readlines()

    for idx, left in enumerate(left_lines):
        right = right_lines[idx]
        print(word2vec(left.replace("\n", ""), right.replace("\n", "")))
if __name__ == "__main__":
    # Two line-aligned word lists: line k of the first file is compared
    # with line k of the second by open_file.
    path, path1 = (
        r"D:\three-graduate\github\PKU\test_0.txt",
        r"D:\three-graduate\github\PKU\test_1.txt",
    )
    open_file(path, path1)
-
获取词向量
import gensim
# Load a Word2Vec model previously saved with gensim.
word_bigram_model = gensim.models.Word2Vec.load(r'word_bigram_vec.model')
# Vectors are read through the model's ``wv`` attribute; indexing the model
# object directly is deprecated in gensim 3.x and removed in gensim 4.0.
b, m = word_bigram_model.wv['经济'], word_bigram_model.wv['技术开发区']
# Show the vector fetched above instead of looking it up a second time.
print(b)
返回值
2019-04-11 14:59:41,165: INFO: loading Word2Vec object from word_bigram_vec.model
2019-04-11 14:59:41,177: INFO: loading wv recursively from word_bigram_vec.model.wv.* with mmap=None
2019-04-11 14:59:41,178: INFO: setting ignored attribute syn0norm to None
2019-04-11 14:59:41,181: INFO: setting ignored attribute cum_table to None
2019-04-11 14:59:41,183: INFO: loaded word_bigram_vec.model
[-0.01684514 0.13455741 0.17175296 -0.15261555 0.14880194 -0.05852989
0.11702891 -0.00645706 0.13129011 0.0037918 -0.07117661 0.04918306
-0.00633776 0.16125411 -0.16449031 0.16797715 -0.08587097 0.11890551
0.15654904 0.03720385 -0.09973465 -0.13967292 0.06865297 0.14942247
-0.05423964 0.03057543 0.12359779 0.04466727 -0.1129114 0.06762148
-0.00404186 0.11382089 0.1590517 -0.00739482 0.09887756 -0.021617
-0.06795673 -0.04936986 0.17318445 0.13710393 0.09871282 -0.15885064
0.02290574 0.10899743 -0.03361735 -0.13518198 -0.04432678 0.15015683
0.09823871 0.03566907 0.16228081 0.01277325 -0.10990467 -0.10631494
0.04612574 -0.15971538 0.0307724 0.15425161 -0.14897354 -0.01123148
-0.0685416 0.16466613 -0.15720974 -0.01152455 0.06378336 0.01186667
0.14268617 0.01728935 0.10357516 0.0513874 -0.14196669 -0.03539138
-0.01240211 0.12300974 0.07864882 -0.07350471 -0.03537329 -0.05262757
-0.09945282 0.12092489 0.02496581 -0.06897018 -0.12307417 0.03045487
-0.04351559 -0.15707819 -0.11303623 0.09016802 0.17420939 -0.05190273
0.01956015 0.13877028 0.04501417 0.06004856 0.02629445 0.05638432
0.00707566 0.0308344 0.0324998 0.07068051]
-
计算词之间的相关性;
# Loaded vector files keyed by path, so each file is parsed only once.
_VECTOR_CACHE = {}


def _load_vectors(path):
    """Load word2vec-format text vectors from ``path``, reusing earlier loads."""
    import gensim

    if path not in _VECTOR_CACHE:
        _VECTOR_CACHE[path] = gensim.models.KeyedVectors.load_word2vec_format(
            path, binary=False
        )
    return _VECTOR_CACHE[path]


def word2vec(word1, word2):
    """Return the similarity of two words under the ``sgns.merge.word`` vectors.

    Args:
        word1: First word.
        word2: Second word.

    Returns:
        The value of ``KeyedVectors.similarity(word1, word2)``, or ``None``
        when a word is missing from the vocabulary (the KeyError message is
        printed in that case).
    """
    model = _load_vectors(r'D:\models\baidubaike\sgns.merge.word')
    try:
        # Call similarity on the KeyedVectors object itself: its ``.wv``
        # alias is deprecated in gensim 3.x and removed in gensim 4.0.
        return model.similarity(word1, word2)
    except KeyError as e:
        # Out-of-vocabulary word: report it and signal "no score" with None.
        print(e)
        return None
-
针对不在语料库中的词,怎么处理?
0.425220965886
0.366352303735
"word '地磁力' not in vocabulary"
None
- 新词发现
针对一些手机相关的特有词汇,如何做:比如AOD屏幕-灭屏显示、一些表达比较长的,等等,如何切成一个词
- 设置自定义语料库