import torch
import torch.nn as nn
import numpy as np
import re
# Vocabulary lookup table. The .npy file holds a pickled Python object, hence
# allow_pickle=True; .tolist() unwraps it from the 0-d object array.
# Presumably a dict mapping token -> embedding row index (get_glove calls
# word2id.get) -- verify against the script that produced the file.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
word2id = np.load("./data/word2id.npy", allow_pickle=True).tolist()

# Pre-trained embedding matrix, one row per vocabulary id.
emb = torch.from_numpy(np.load("./data/glove/glove_300d.npy"))

# Size the embedding table from the loaded matrix instead of repeating its
# dimensions as literals, so the layer cannot drift out of sync with the file.
glove = nn.Embedding(emb.shape[0], emb.shape[1])
with torch.no_grad():
    # In-place copy keeps the layer's own dtype; no_grad keeps the
    # initialisation out of the autograd graph.
    glove.weight.copy_(emb)

# Dimensionality of a single word vector.
word_dim = glove.embedding_dim
def get_glove(sentence_lists):
    """Look up GloVe embeddings for a batch of tokenized sentences.

    Every token is mapped to an id through the module-level ``word2id``
    (tokens missing from it fall back to id 0), each sentence is right-padded
    to the length of the longest sentence in the batch, and the resulting id
    matrix is passed through the module-level ``glove`` embedding layer.

    Args:
        sentence_lists: A non-empty collection of sentences, each a list of
            token strings.

    Returns:
        The output of ``glove`` for the padded id matrix, i.e. a tensor of
        shape ``(number of sentences, longest sentence length, embedding dim)``.

    Raises:
        ValueError: If ``sentence_lists`` contains no sentences.
    """
    sentences = list(sentence_lists)
    if not sentences:
        # Previously surfaced as an opaque "max() arg is an empty sequence".
        raise ValueError("sentence_lists must contain at least one sentence")

    # Padding uses the last row of the embedding table.
    # NOTE(review): presumably that row is reserved for padding -- confirm
    # against how the embedding matrix was built.
    pad_id = glove.num_embeddings - 1

    # word -> id (0 for out-of-vocabulary tokens)
    id_rows = [[word2id.get(token, 0) for token in sentence]
               for sentence in sentences]

    # right-pad every row to the longest sentence so the batch is rectangular
    max_len = max(len(row) for row in id_rows)
    padded = [row + [pad_id] * (max_len - len(row)) for row in id_rows]

    # id -> embedding vector
    return glove(torch.LongTensor(padded))
# Ordered (pattern, replacement) rules applied one after another by clean_str.
# The replacements for "(", ")" and "?" are raw strings: written as plain
# literals, "\(" etc. are invalid escape sequences (a SyntaxWarning on recent
# Python versions). re.sub leaves these unknown non-letter escapes alone, so
# the backslash is emitted in the output exactly as before.
_CLEAN_STR_RULES = (
    (re.compile(r"[^A-Za-z0-9(),!?\'\`]"), " "),  # drop every other character
    (re.compile(r"\'s"), " 's"),                  # split off clitics
    (re.compile(r"\'ve"), " 've"),
    (re.compile(r"n\'t"), " n't"),
    (re.compile(r"\'re"), " 're"),
    (re.compile(r"\'d"), " 'd"),
    (re.compile(r"\'ll"), " 'll"),
    (re.compile(r","), " , "),                    # isolate punctuation
    (re.compile(r"!"), " ! "),
    (re.compile(r"\("), r" \( "),
    (re.compile(r"\)"), r" \) "),
    (re.compile(r"\?"), r" \? "),
    (re.compile(r"\s{2,}"), " "),                 # collapse whitespace runs
)


def clean_str(string):
    """Tokenization/string cleaning for all datasets except for SST.

    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters other than letters, digits and ``( ) , ! ? ' ``` `` are replaced
    by spaces, English clitics ('s, 've, n't, 're, 'd, 'll) and punctuation
    are separated by spaces, and runs of whitespace are collapsed. Parentheses
    and question marks come out prefixed with a backslash, as in the original
    implementation.

    Args:
        string: Raw text to clean.

    Returns:
        The cleaned text, stripped and lower-cased, with tokens separated by
        single spaces.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = pattern.sub(replacement, string)
    return string.strip().lower()
# Smoke test: clean and tokenize two sentences, embed them as one batch and
# print the shape of the result.
# NOTE(review): this runs at import time; consider an
# `if __name__ == "__main__":` guard if the module is imported elsewhere.
sentence1 = 'im a boy !'
sentence2 = 'in a girl'
# Named `batch` rather than `input` so the builtin input() is not shadowed.
batch = [clean_str(sentence1).split(), clean_str(sentence2).split()]
# An n-word sentence becomes an (n x dim) matrix; the batch stacks one such
# matrix per sentence, padded to the longest sentence (4 tokens here).
print(get_glove(batch).shape)
print('over')
# Two data file paths are used by this script: ./data/word2id.npy and ./data/glove/glove_300d.npy