word2vec 功能实现

最新推荐文章于 2021-03-27 11:05:39 发布

不争而善胜

最新推荐文章于 2021-03-27 11:05:39 发布

阅读量183

点赞数

分类专栏： python 统计 create

本文链接：https://blog.csdn.net/qq_42501075/article/details/105747877

版权

python 同时被 3 个专栏收录

56 篇文章 0 订阅

订阅专栏

统计

45 篇文章 0 订阅

订阅专栏

create

26 篇文章 0 订阅

订阅专栏

import torch
import torch.nn as nn
import numpy as np
import re

# Token -> integer id lookup. The .npy file holds a pickled Python object
# (hence allow_pickle=True); get_glove() calls .get(token, 0) on it, so it is
# presumably a dict -- TODO confirm against the script that wrote the file.
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load this
# file from a trusted source.
word2id = np.load("./data/word2id.npy",allow_pickle=True).tolist()
# Embedding table with 18766 rows of 300 values each.
glove = nn.Embedding(18766, 300)
# Pre-trained vectors loaded from disk.
# assumes the array has shape (18766, 300) so that the copy below succeeds -- TODO confirm
emb = torch.from_numpy(np.load("./data/glove/glove_300d.npy"))

# Overwrite the randomly initialised embedding weights in place with the loaded vectors.
glove.weight.data.copy_(emb)
# Width of one embedding vector; matches the second argument of nn.Embedding above.
word_dim = 300

def get_glove(sentence_lists):
    """Look up GloVe vectors for a batch of tokenized sentences.

    Each token is mapped to its id through the module-level ``word2id``
    lookup (tokens that are not found map to id 0). Shorter sentences are
    right-padded to the length of the longest one with the id of the last
    row of the module-level ``glove`` embedding table, and the resulting id
    matrix is passed through ``glove``.

    Args:
        sentence_lists: Sequence of sentences, each a list of token strings.

    Returns:
        A float tensor of shape ``(len(sentence_lists), max_len, dim)`` where
        ``max_len`` is the longest sentence length and ``dim`` is the
        embedding width of ``glove``. An empty batch yields an empty tensor
        of shape ``(0, 0, dim)``.
    """
    # An empty batch has no "longest sentence"; return an empty result
    # instead of letting max() raise ValueError.
    if len(sentence_lists) == 0:
        return glove(torch.empty((0, 0), dtype=torch.long))

    # The last embedding row serves as padding. Reading the index from the
    # table keeps it in sync with the table size instead of repeating a
    # hard-coded vocabulary size here.
    pad_id = glove.num_embeddings - 1
    max_len = max(len(sentence) for sentence in sentence_lists)

    # word -> id, then right-pad every sentence to max_len
    padded_ids = [
        [word2id.get(word, 0) for word in sentence]
        + [pad_id] * (max_len - len(sentence))
        for sentence in sentence_lists
    ]

    # id -> embedding vector
    return glove(torch.LongTensor(padded_ids))

def clean_str(string):
    """Clean and tokenize-by-spacing a raw English sentence.

    Tokenization/string cleaning for all datasets except for SST.
    Original taken from
    https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py

    Characters other than letters, digits and ``( ) , ! ? ' ` `` become
    spaces, common English contractions are split off (``don't`` ->
    ``do n't``, ``it's`` -> ``it 's``), punctuation marks are surrounded by
    spaces, runs of whitespace are collapsed, and the result is stripped and
    lower-cased.

    Unlike the upstream version, parentheses and question marks are emitted
    as plain ``(``, ``)`` and ``?``. The upstream replacement strings
    ``" \\( "``, ``" \\) "`` and ``" \\? "`` kept the backslash in the output,
    producing tokens such as ``\\?``.

    Args:
        string: Raw input text.

    Returns:
        The cleaned, lower-cased string with tokens separated by single spaces.
    """
    # Order matters: filter characters first, split contractions next, pad
    # punctuation, and collapse the whitespace introduced by earlier steps last.
    rules = (
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"\'s", " 's"),
        (r"\'ve", " 've"),
        (r"n\'t", " n't"),
        (r"\'re", " 're"),
        (r"\'d", " 'd"),
        (r"\'ll", " 'll"),
        (r",", " , "),
        (r"!", " ! "),
        (r"\(", " ( "),
        (r"\)", " ) "),
        (r"\?", " ? "),
        (r"\s{2,}", " "),
    )
    for pattern, replacement in rules:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()

# Demo: clean and whitespace-tokenize two sample sentences, embed them as one
# batch, and print the resulting tensor shape.
sentence1 = 'im a boy !'
sentence2 = 'in a girl'
input = [clean_str(raw).split() for raw in (sentence1, sentence2)]
# The shorter sentence is padded, so the shape is (2, longest_sentence_len, dim).
print(get_glove(input).shape)
print('over')

两个文件地址文件地址

不争而善胜

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
word2vec 功能实现

import torch; import torch.nn as nn; import numpy as np; import re; word2id = np.load("./data/word2id.npy", allow_pickle=True).tolist(); glove = nn.Embedding(18766, 300); emb = torch.from_numpy(np.load("./d...
复制链接

扫一扫