torchtext usage example

Building the dataset

Walk each class directory, read every document as a single string, pair it with an integer class id, and shuffle the result.

import os
import random

root = 'dataset/Long-document-dataset/csv/'
# Map each class (sub-directory name) to an integer label.
# Note: os.listdir order is not guaranteed, so the mapping may vary across runs.
cls_map = {n: i for i, n in enumerate(os.listdir(root))}
print(cls_map)

corpus = []
for P in os.listdir(root):
    # Each class directory contains a nested directory of the same name
    p = root + P + '/' + P + '/'
    print(p)
    for fp in os.listdir(p):
        with open(p + fp) as f:
            text = ' '.join(f.readlines())
        corpus.append([text, [cls_map[P]]])
random.shuffle(corpus)
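
Each corpus entry pairs the raw document text with its label wrapped in a single-element list. A quick sanity check (the printed values are illustrative):

print(len(corpus))
print(corpus[0][1])       # e.g. [3] -- the integer class id
print(corpus[0][0][:80])  # the first 80 characters of the document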

Building the Field, Dataset, and Iterator

Define how raw text is tokenized and padded (Field), wrap the corpus in a torchtext Dataset, and batch it with an Iterator.

import numpy as np
import torch
from torchtext import data
from torchtext.data import Example, BucketIterator, Iterator
from torchtext.vocab import GloVe, Vectors

tokenize = lambda x: x.split()
MAX_LEN = 100
# Pad or truncate every example to fix_length tokens
TEXT = data.Field(sequential=True, tokenize=tokenize, lower=True, fix_length=MAX_LEN)
LABEL = data.Field(sequential=False, use_vocab=False)
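
As a quick sanity check, the legacy Field exposes a preprocess hook that applies the tokenizer and lowercasing (a minimal sketch; the expected output is shown for illustration):

print(TEXT.preprocess("The Quick Brown Fox"))
# expected: ['the', 'quick', 'brown', 'fox']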

class MyDataset(data.Dataset):
    def __init__(self, csv_data, text_field, label_field, test=False, aug=False, **kwargs):
        # csv_data is actually a list: [[text, [label]], ...]
        fields = [("id", None), ("text", text_field), ("label", label_field)]

        examples = []
        for text in csv_data:
            # Keep only the first 1000 characters of each document
            examples.append(data.Example.fromlist([None, text[0][:1000], text[1]], fields))

        # The above is just preprocessing; calling the parent constructor
        # turns the examples into a standard Dataset
        super(MyDataset, self).__init__(examples, fields)

    def shuffle(self, text):
        # Augmentation helper: randomly permute the token order
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)

    def dropout(self, text, p=0.5):
        # Augmentation helper: blank out roughly a fraction p of the tokens
        # (np.random.choice samples with replacement, so slightly fewer
        # positions may actually be hit)
        text = text.strip().split()
        len_ = len(text)
        indexs = np.random.choice(len_, int(len_ * p))
        for i in indexs:
            text[i] = ''
        return ' '.join(text)
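
Note that neither augmentation helper is actually wired into __init__ (the aug flag is accepted but unused). A minimal sketch of what the dropout helper does to a toy sentence (the output is random; shown for illustration):

ds = MyDataset([["a tiny example", [0]]], text_field=TEXT, label_field=LABEL)
print(ds.dropout("the quick brown fox jumps over the lazy dog", p=0.5))
# e.g. "the quick  fox jumps  the  dog" -- roughly half the tokens blanked out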
    

def data_iter(TEXT, LABEL):

    train = MyDataset(corpus[:20000], text_field=TEXT, label_field=LABEL, test=False, aug=1)
    # For an unlabeled test set one could pass label_field=None
    test = MyDataset(corpus[20000:], text_field=TEXT, label_field=LABEL, test=True, aug=1)

    # Build the vocabulary from the training set, attaching pretrained GloVe vectors
    vectors = Vectors(name='/home1/lihaoyuan/data/NLP/glove/glove.6B.100d.txt', cache='.vector_cache')
    TEXT.build_vocab(train, vectors=vectors)
    # Alternatively: TEXT.build_vocab(train, vectors=GloVe(name='6B', dim=300))
    # (the model's embedding dim must then match the vector dim)
    weight_matrix = TEXT.vocab.vectors
    print(weight_matrix.shape)

    # To build an iterator for the training set only:
    # train_iter = data.BucketIterator(dataset=train, batch_size=8, shuffle=True,
    #                                  sort_within_batch=False, repeat=False)

    # To build iterators for a train/validation pair at once:
    # train_iter, val_iter = data.BucketIterator.splits(
    #         (train, valid),
    #         batch_sizes=(8, 8),
    #         device=-1,                       # replace -1 with the GPU id when using a GPU
    #         sort_key=lambda x: len(x.text),  # the key used to sort examples
    #         sort_within_batch=False,
    #         repeat=False
    # )
    train_iter = Iterator(train, batch_size=64, device=torch.device('cuda:0'),
                          sort=False, sort_within_batch=False, repeat=False)
    test_iter = Iterator(test, batch_size=64, device=torch.device('cuda:0'),
                         sort=False, sort_within_batch=False, repeat=False)
    return train_iter, test_iter, weight_matrix

train_iter, test_iter, weight_matrix = data_iter(TEXT, LABEL)
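
A quick way to inspect what the iterator yields (a hedged sketch; the text shape follows from fix_length=100 and the legacy Field's default batch_first=False):

batch = next(iter(train_iter))
print(batch.text.shape)   # expect torch.Size([100, 64]) for a full batch: (seq_len, batch)
print(batch.label.shape)  # likely torch.Size([64, 1]) -- hence the .view(-1) in the training loop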

The classification model

A one-layer bidirectional LSTM over pretrained GloVe embeddings; its outputs are averaged over time and fed to a linear classifier over the 8 classes.

import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class RNN(nn.Module):

    def __init__(self):
        super(RNN, self).__init__()
        # Embedding output shape: torch.Size([100, b, 100]) -- (seq_len, batch, emb_dim)
        self.word_embeddings = nn.Embedding(len(TEXT.vocab), 100)
        # Initialise the embedding table with the pretrained GloVe weights
        self.word_embeddings.weight.data.copy_(weight_matrix)
        # LSTM output shape: torch.Size([100, b, 256]) -- bidirectional doubles hidden_size
        self.lstm = nn.LSTM(input_size=100, hidden_size=128, bidirectional=True, num_layers=1)
        self.decoder = nn.Linear(256, 8)  # 2 * 128 -> 8 classes

    def forward(self, sentence):
        embeds = self.word_embeddings(sentence)
        lstm_out = self.lstm(embeds)[0]
        # final = lstm_out[-1]    # take the last time step
        final = lstm_out.mean(0)  # average over time steps
        y = self.decoder(final)
        return y
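
A hedged shape check with a dummy batch (assumes the vocabulary and weight_matrix from data_iter are already in scope, since the model reads both):

model = RNN().cuda()
dummy = torch.randint(0, len(TEXT.vocab), (100, 4)).cuda()  # (seq_len=100, batch=4)
print(model(dummy).shape)  # expected: torch.Size([4, 8]) -- one logit vector per example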


acc_ = []  # running per-batch training accuracy, accumulated across epochs
def main():
    model = RNN().cuda()
    model.train()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001)
    loss_function = F.cross_entropy

    for epoch in range(10):
        for i, batch in enumerate(train_iter):
            optimizer.zero_grad()
            predicted = model(batch.text)

            loss = loss_function(predicted, batch.label.view(-1))
            loss.backward()
            optimizer.step()

            acc = (predicted.argmax(1) == batch.label.view(-1)).sum().float() / batch.label.size(0)
            acc_.append(acc.item())

        # acc_ is never reset, so this prints a running mean over all epochs so far
        print('epoch:%d  loss:%.3f  acc:%.3f' % (epoch + 1, loss.item(), np.mean(acc_)))

    model.eval()
    acc_test = []
    with torch.no_grad():  # no gradients needed at evaluation time
        for i, batch in enumerate(test_iter):
            predicted = model(batch.text).argmax(1)
            acc = (predicted == batch.label.view(-1)).sum().float() / batch.label.size(0)
            acc_test.append(acc.item())
    print(np.mean(acc_test))


if __name__ == '__main__':
    main()

References:

Official documentation: https://torchtext.readthedocs.io/en/latest/vocab.html?highlight=vocab

https://blog.csdn.net/nlpuser/article/details/88067167

https://www.cnblogs.com/linzhenyu/p/13277552.html
