Python torch.renorm 方法代码示例

# 需要导入模块: import torch [as 别名]

# 或者: from torch import renorm [as 别名]

def _read_binary_word(fin):
    """Read one space-terminated word (as bytes) from the binary stream *fin*.

    Newline bytes are skipped, because some binary files put a newline in
    front of each word. Returns ``None`` when the stream is already exhausted
    at a word boundary, and raises ``EOFError`` when it ends in the middle of
    a word.
    """
    chars = []
    while True:
        ch = fin.read(1)
        if ch == b' ':
            return b''.join(chars)
        if ch == b'':
            # read() returns b'' at EOF; without this check the loop would
            # never terminate on a file shorter than `vocab_size` entries.
            if chars:
                raise EOFError("unexpected end of input; is vocab_size "
                               "incorrect or the file otherwise damaged?")
            return None
        if ch != b'\n':
            chars.append(ch)


def load_word2vec_format(filename, word_idx, binary=False, normalize=False,
                         encoding='utf8', unicode_errors='ignore',
                         vocab_size=1917494, vector_size=300):
    """Load pre-trained word embeddings into a torch matrix (adapted from gensim).

    The file is read WITHOUT a ``"<vocab_size> <vector_size>"`` header line:
    the text branch validates every line (including the first) as a vector,
    and the binary branch starts reading a word immediately. The sizes are
    therefore taken from the ``vocab_size`` / ``vector_size`` arguments.

    If you trained the C model using a non-utf8 encoding for words, specify
    that encoding in `encoding`.

    :param filename: path of the embedding file; it is opened in binary mode.
    :param word_idx: mapping from word to row index in the returned matrix.
        It must support ``len()``, ``in`` and item lookup. Words in the file
        that are not in `word_idx` are skipped; words in `word_idx` that are
        not in the file keep an all-zero row.
    :param binary: a boolean indicating whether the data is in binary
        word2vec format (word, space, then `vector_size` float32 values).
    :param normalize: if true, apply ``torch.renorm(matrix, 2, 0, 1)`` to the
        result, i.e. rows whose L2 norm exceeds 1 are scaled down to norm 1.
        Rows with a smaller norm are left unchanged.
    :param encoding: encoding used to decode the words.
    :param unicode_errors: decode error handling; can be 'strict', 'replace'
        or 'ignore' and defaults to 'ignore'.
    :param vocab_size: maximum number of entries read in binary mode. Reading
        stops early if the file ends at an entry boundary. Unused in text mode.
    :param vector_size: dimensionality of each embedding vector.
    :return: tuple ``(word_matrix, vector_size, vocab)`` where `word_matrix`
        is a ``len(word_idx) x vector_size`` float tensor and `vocab` is the
        set of words from `word_idx` that were found in the file.
    :raises ValueError: in text mode, when a line does not contain exactly
        ``vector_size + 1`` space-separated fields.
    :raises EOFError: in binary mode, when the file ends in the middle of a
        word or of a vector.
    """
    vocab = set()
    print("loading word embedding from %s" % filename)

    with open(filename, 'rb') as fin:
        word_matrix = torch.zeros(len(word_idx), vector_size)

        def add_word(_word, _weights):
            # Only words requested through word_idx get a row in the matrix.
            if _word not in word_idx:
                return
            vocab.add(_word)
            word_matrix[word_idx[_word]] = _weights

        if binary:
            binary_len = np.dtype(np.float32).itemsize * vector_size
            for _ in range(vocab_size):
                # mixed text and binary: read text first, then binary
                raw_word = _read_binary_word(fin)
                if raw_word is None:
                    # Clean end of file before vocab_size entries were read.
                    break
                word = raw_word.decode(encoding, errors=unicode_errors)
                payload = fin.read(binary_len)
                if len(payload) != binary_len:
                    raise EOFError("unexpected end of input; is vocab_size "
                                   "incorrect or the file otherwise damaged?")
                # frombuffer yields a read-only view of the bytes object;
                # copy so torch receives a writable array.
                values = np.frombuffer(payload, dtype=np.float32).copy()
                add_word(word, torch.from_numpy(values))
        else:
            for line_no, line in enumerate(fin):
                parts = line.rstrip().decode(encoding, errors=unicode_errors).split(" ")
                if len(parts) != vector_size + 1:
                    raise ValueError("invalid vector on line %s (is this really the text format?)" % line_no)
                word = parts[0]
                weights = torch.Tensor([float(value) for value in parts[1:]])
                add_word(word, weights)

    if normalize:
        # Cap the L2 norm of every row (dim 0) at 1; shorter rows are untouched.
        word_matrix = torch.renorm(word_matrix, 2, 0, 1)

    print("loaded %d words pre-trained from %s with %d" % (len(vocab), filename, vector_size))
    return word_matrix, vector_size, vocab

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值