word2vec 功能实现

45 篇文章 0 订阅
26 篇文章 0 订阅
import torch
import torch.nn as nn
import numpy as np
import re

# Token -> row-id lookup; get_glove() queries it with .get().
# NOTE(review): allow_pickle=True unpickles the file on load -- only use
# this with a trusted word2id.npy.
word2id = np.load("./data/word2id.npy", allow_pickle=True).tolist()

# Width of every embedding vector.
word_dim = 300

# Embedding table with 18766 rows; its random initial weights are
# overwritten with the matrix loaded from disk.
glove = nn.Embedding(18766, word_dim)
emb = torch.from_numpy(np.load("./data/glove/glove_300d.npy"))
glove.weight.data.copy_(emb)

def get_glove(sentence_lists):
    """
    Look up GloVe embedding vectors for a batch of tokenized sentences.

    Args:
        sentence_lists: sequence of sentences, each a list of token strings.

    Returns:
        The output of the module-level ``glove`` embedding applied to the
        padded id matrix (one row per sentence, one column per token
        position up to the longest sentence in the batch).

    Tokens missing from ``word2id`` are mapped to id 0. Shorter sentences
    are right-padded with the last row index of the embedding table
    (presumably that row holds the padding vector -- confirm against the
    data files).
    """
    # Take the padding id from the table itself rather than repeating its size.
    pad_id = glove.num_embeddings - 1
    # default=0 keeps an empty batch from raising inside max().
    max_len = max((len(sentence) for sentence in sentence_lists), default=0)

    padded_ids = []
    for sentence in sentence_lists:
        # word -> id
        ids = [word2id.get(token, 0) for token in sentence]
        # pad on the right up to the longest sentence
        padded_ids.append(ids + [pad_id] * (max_len - len(ids)))

    # The reshape is a no-op for a non-empty batch; for an empty batch it
    # turns the flat empty tensor into a 2-D (0, 0) id matrix.
    id_matrix = torch.LongTensor(padded_ids).reshape(len(padded_ids), max_len)

    # id -> embedding vector
    return glove(id_matrix)

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Replaces every character outside ``A-Za-z0-9(),!?'`` and the backtick
    with a space, splits common English clitics ('s, 've, n't, 're, 'd, 'll)
    off the preceding word, surrounds ``, ! ( ) ?`` with spaces, collapses
    runs of whitespace, then strips and lower-cases the result.

    Unlike the upstream snippet, parentheses and question marks are emitted
    as plain ``(``, ``)`` and ``?``. Upstream used the replacement strings
    " \\( ", " \\) " and " \\? ", which re.sub copies through with the
    backslash, so the resulting tokens carried a stray leading backslash.

    Args:
        string: raw input text.

    Returns:
        The cleaned, lower-cased text with tokens separated by single spaces.
    """
    substitutions = (
        # Drop everything outside the allowed character set.
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        # Detach clitics from the word they follow.
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        # Isolate punctuation as separate tokens.
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        # Collapse the extra whitespace introduced above.
        (r"\s{2,}", " "),
    )
    for pattern, replacement in substitutions:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

# Demo: clean and tokenize two short sentences, then embed them as one batch.
sentence1 = 'im a boy !'
sentence2 = 'in a girl'
# Named `batch` rather than `input` so the builtin input() is not shadowed.
batch = [clean_str(sentence).split() for sentence in (sentence1, sentence2)]
# Prints the shape of the embedding tensor returned for the padded batch.
print(get_glove(batch).shape)
print('over')
# A sentence of n words becomes an n x dim matrix.



两个文件(word2id.npy、glove_300d.npy)的地址:文件地址

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值