这篇博客在之前的基础上,构建完整的「文本 -> 下标 -> embedding」流程。
之前的完整代码
from torchtext import data
import torch
def split_tokenize(x):
    """Split one cell's text into a list of tokens on whitespace.

    Uses str.split() with no argument: it collapses runs of whitespace and
    never produces empty-string tokens, whereas x.split(' ') would emit ''
    tokens for consecutive spaces and pollute the vocabulary.
    """
    return x.split()
def country_tokenize(x):
    """Tokenizer for the country column: keep the whole name as one token.

    e.g. "United Kingdom" stays a single token instead of ['United', 'Kingdom'].
    A torchtext Field tokenizer must return a list of tokens; returning the
    raw string (as the original did) makes downstream numericalization
    iterate over it character by character.
    """
    return [x]
# Fields describe how each CSV column is tokenized and numericalized.
NAME = data.Field()  # no tokenize given: defaults to whitespace splitting
COUNTRY = data.Field(tokenize=country_tokenize)  # keep country name whole, e.g. "United Kingdom"
# Comment gets explicit start/end-of-sequence markers around each example.
Comment = data.Field(tokenize=split_tokenize, init_token='<SOS>', eos_token='<EOS>')
# (column name, Field) pairs in CSV column order; None skips the 'age' column.
fields = [('name', NAME), ('country', COUNTRY), ('age', None), ('comment', Comment)]
# TabularDataset.splits returns a tuple with one dataset per split;
# only a train split is given here.
train_set = data.TabularDataset.splits(
    path='./',
    train='my_csv_text.csv',
    format='csv',
    skip_header=True,
    fields=fields)
train_set = train_set[0]  # unpack the single (train) dataset from the tuple
print("分词结果:", train_set.examples[0].comment)
from gensim.models import Word2Vec
def train_word2vec(sentences, vector_size=256, window=5, sg=0, min_count=1):
    """Train a gensim Word2Vec model on pre-tokenized sentences.

    The hyperparameters are exposed as keyword arguments (with the original
    hard-coded values as defaults) so the helper is reusable beyond the demo.

    Args:
        sentences: iterable of token lists.
        vector_size: embedding dimensionality (256 matches the demo's nn.Embedding).
        window: context window size.
        sg: 0 for CBOW (the original behavior), 1 for skip-gram.
        min_count: ignore tokens with total frequency below this.

    Returns:
        A trained gensim Word2Vec model.
    """
    return Word2Vec(sentences, vector_size=vector_size, min_count=min_count,
                    window=window, sg=sg)
# Train a toy Word2Vec model on just two tokenized comments — far too little
# data for a real model, but enough to demonstrate the pipeline.
word2vec_model = train_word2vec([
    train_set.examples[0].comment,
    train_set.examples[1].comment,
])
# Persist the vectors in the textual word2vec format so torchtext can load them.
word2vec_model.wv.save_word2vec_format('my_word2vec_format.txt')
from torchtext.vocab import Vectors
vectors = Vectors(name='my_word2vec_format.txt')
# Build the comment field's vocabulary and attach the pretrained vectors;
# Comment.vocab.vectors is consumed by the embedding demo below.
Comment.build_vocab(train_set, vectors=vectors)
对之前的这部分有任何疑问,请参考:torchtext处理文本数据——使用自己的word2vec模型作为词向量建立词表(学习二)
将文本转为下标,将下标借助embedding转为向量
# ---- Convert each comment's tokens to vocabulary indices ----
for example in train_set:
    tokens = example.comment
    print(tokens)
    # stoi maps each token string to its integer index in the vocab
    print([Comment.vocab.stoi[token] for token in tokens])
# ========================= Embedding lookup demo =========================
import torch.nn as nn

# Size the embedding table from the vocabulary instead of hard-coding 18,
# so it stays correct when the vocab changes; 256 matches vector_size above.
embedding = nn.Embedding(len(Comment.vocab), 256)
# Initialize the embedding weights with the pretrained word2vec vectors.
embedding.weight.data.copy_(Comment.vocab.vectors)
for example in train_set:
    print(example.comment)
    # text -> indices via the vocabulary's string-to-index mapping
    index = [Comment.vocab.stoi[token] for token in example.comment]
    # indices -> embeddings; build a LongTensor directly rather than
    # creating a float Tensor and casting it with .long()
    print(embedding(torch.tensor(index, dtype=torch.long)))