word_ngram 模型使用小tip

  1. 模型加载

import math

import gensim.models
import numpy as np
import pandas

#vocabulary 中不存在的单词计算,切分字符求平均计算余弦相似度 
# 本方法,使用wiki word+char模型,计算PKU 500词语相似,在wiki中的词汇
def cha_w2v(s, w2v=None):
    """Approximate a vector for an out-of-vocabulary word.

    Splits *s* into individual characters, looks each one up in the
    embedding model, and returns the element-wise mean of the vectors.
    (Used to score PKU-500 word pairs with the wiki word+char model when
    the full word is not in the vocabulary.)

    Parameters:
        s: the word (any iterable of characters/tokens) to embed.
        w2v: token -> vector mapping; defaults to the module-level
            ``model`` for backward compatibility with the original code.

    Returns:
        The mean of the per-character vectors.

    Raises:
        ValueError: if *s* is empty (the original raised ZeroDivisionError).
        KeyError: if a character is missing from the vocabulary.
    """
    lookup = w2v if w2v is not None else model
    vectors = []
    for ch in s:
        print(ch, lookup[ch])  # debug trace kept from the original
        vectors.append(lookup[ch])
    if not vectors:
        raise ValueError("cannot embed an empty string")
    return sum(vectors) / len(vectors)

def Cosine(vec1, vec2):
    """Return the cosine similarity between two vectors.

    Parameters:
        vec1, vec2: array-like numeric sequences of equal length.

    Returns:
        dot(vec1, vec2) / (||vec1|| * ||vec2||) as a numpy float.
        NOTE(review): like the original, a zero vector yields nan/inf
        (numpy division warning), not an exception.
    """
    a = np.asarray(vec1, dtype=float)
    b = np.asarray(vec2, dtype=float)
    # np.linalg.norm replaces the original's hand-rolled
    # math.sqrt((v**2).sum()) and drops the extra math dependency.
    return a.dot(b) / (np.linalg.norm(a) * np.linalg.norm(b))

    
def word2vec(word1, word2):
    """Return the similarity of two words under the baike embeddings.

    Parameters:
        word1, word2: the two words to compare.

    Returns:
        The cosine similarity as a float, or None (after printing the
        error) when either word is not in the vocabulary — preserving
        the original best-effort behaviour.
    """
    import gensim
    # Loading the text-format vectors is very slow; cache the model on
    # the function object so repeated calls (one per word pair in
    # open_file) load it only once per process.
    if not hasattr(word2vec, "_model"):
        word2vec._model = gensim.models.KeyedVectors.load_word2vec_format(
            r'D:\paper\github\baike\sgns.target.word', binary=False)
    # KeyedVectors.wv is deprecated (removed in gensim 4.x); fall back
    # to the object itself when .wv is absent.
    kv = getattr(word2vec._model, "wv", word2vec._model)
    try:
        return kv.similarity(word1, word2)  # similarity of the two words
    except KeyError as e:
        # Narrowed from bare Exception: a missing word raises KeyError
        # ("word '...' not in vocabulary").
        print(e)
            
def open_file(path, path1):
    """Print the similarity of each word pair from two parallel files.

    Each file holds one word per line; line i of *path* is paired with
    line i of *path1* and their word2vec similarity is printed.

    Parameters:
        path: file of first words, one per line.
        path1: file of second words, one per line.
    """
    # Explicit UTF-8: the files hold Chinese words and the platform
    # default (e.g. cp936 on Windows) may differ — TODO confirm encoding.
    with open(path, "r", encoding="utf-8") as f:
        first_words = f.read().splitlines()
    with open(path1, "r", encoding="utf-8") as f:
        second_words = f.read().splitlines()
    # zip stops at the shorter list, where the original
    # range(len(first)) loop raised IndexError on a shorter second file;
    # splitlines() also replaces the manual "\n" stripping.
    for w1, w2 in zip(first_words, second_words):
        print(word2vec(w1, w2))
if __name__ == "__main__":
    # Parallel word lists for the PKU word-similarity evaluation.
    first_list = r"D:\three-graduate\github\PKU\test_0.txt"
    second_list = r"D:\three-graduate\github\PKU\test_1.txt"
    open_file(first_list, second_list)

 

  1. 获取词向量

import gensim

# Load the previously trained word+bigram Word2Vec model from disk.
word_bigram_model = gensim.models.Word2Vec.load(r'word_bigram_vec.model')
# Look up the embedding vectors of two sample tokens.
b = word_bigram_model['经济']
m = word_bigram_model['技术开发区']
print(word_bigram_model['经济'])

 返回值

2019-04-11 14:59:41,165: INFO: loading Word2Vec object from word_bigram_vec.model
2019-04-11 14:59:41,177: INFO: loading wv recursively from word_bigram_vec.model.wv.* with mmap=None
2019-04-11 14:59:41,178: INFO: setting ignored attribute syn0norm to None
2019-04-11 14:59:41,181: INFO: setting ignored attribute cum_table to None
2019-04-11 14:59:41,183: INFO: loaded word_bigram_vec.model
[-0.01684514  0.13455741  0.17175296 -0.15261555  0.14880194 -0.05852989
  0.11702891 -0.00645706  0.13129011  0.0037918  -0.07117661  0.04918306
 -0.00633776  0.16125411 -0.16449031  0.16797715 -0.08587097  0.11890551
  0.15654904  0.03720385 -0.09973465 -0.13967292  0.06865297  0.14942247
 -0.05423964  0.03057543  0.12359779  0.04466727 -0.1129114   0.06762148
 -0.00404186  0.11382089  0.1590517  -0.00739482  0.09887756 -0.021617
 -0.06795673 -0.04936986  0.17318445  0.13710393  0.09871282 -0.15885064
  0.02290574  0.10899743 -0.03361735 -0.13518198 -0.04432678  0.15015683
  0.09823871  0.03566907  0.16228081  0.01277325 -0.10990467 -0.10631494
  0.04612574 -0.15971538  0.0307724   0.15425161 -0.14897354 -0.01123148
 -0.0685416   0.16466613 -0.15720974 -0.01152455  0.06378336  0.01186667
  0.14268617  0.01728935  0.10357516  0.0513874  -0.14196669 -0.03539138
 -0.01240211  0.12300974  0.07864882 -0.07350471 -0.03537329 -0.05262757
 -0.09945282  0.12092489  0.02496581 -0.06897018 -0.12307417  0.03045487
 -0.04351559 -0.15707819 -0.11303623  0.09016802  0.17420939 -0.05190273
  0.01956015  0.13877028  0.04501417  0.06004856  0.02629445  0.05638432
  0.00707566  0.0308344   0.0324998   0.07068051]
  1. 计算词之间的相关性;

def word2vec(word1, word2):
    """Return the similarity of two words under the baidubaike embeddings.

    Parameters:
        word1, word2: the two words to compare.

    Returns:
        The cosine similarity as a float, or None (after printing the
        error) when either word is out of vocabulary — preserving the
        original best-effort behaviour.
    """
    import gensim
    # The text-format load is expensive; cache the model on the function
    # object so it is loaded only once per process instead of per call.
    if not hasattr(word2vec, "_model"):
        word2vec._model = gensim.models.KeyedVectors.load_word2vec_format(
            r'D:\models\baidubaike\sgns.merge.word', binary=False)
    # .wv on KeyedVectors is deprecated (removed in gensim 4.x); use the
    # object directly when .wv is absent.
    kv = getattr(word2vec._model, "wv", word2vec._model)
    try:
        return kv.similarity(word1, word2)  # similarity of the two words
    except KeyError as e:
        # Narrowed from bare Exception: missing words raise KeyError.
        print(e)
            

 

  1. 针对不在语料库中的词,怎么处理?

0.425220965886
0.366352303735
"word '地磁力' not in vocabulary"
None
  1. 新词发现

针对一些手机相关的特有词汇,如何做:比如AOD屏幕-灭屏显示、一些表达比较长的,等等,如何切成一个词

  1. 设置自定义语料库

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值