# Encode every (sentence, tag-sequence) pair into id form, then carve the
# first 3000 pairs off as the training split and keep the rest for testing.
encoded_pairs = [
    (vocab.convert_tokens_to_ids(sentence), tag_vocab.convert_tokens_to_ids(tags))
    for sentence, tags in zip(sents, postags)
]
train_data, test_data = encoded_pairs[:3000], encoded_pairs[3000:]
def collate_fn(examples):
    """Pad a batch of (input-ids, target-ids) pairs to a common length.

    Args:
        examples: list of ``(input_ids, target_ids)`` pairs, where the two
            sequences of a pair have equal length.

    Returns:
        ``(inputs, lengths, targets, mask)`` — ``inputs`` and ``targets``
        padded to the batch maximum length (``batch_first``), ``lengths``
        the original per-sequence lengths, and ``mask`` a boolean tensor
        that is True at non-padding positions.  The exact padding value is
        irrelevant downstream because the mask excludes padded positions.
    """
    lengths = torch.tensor([len(ex[0]) for ex in examples])
    inputs = [torch.tensor(ex[0]) for ex in examples]
    targets = [torch.tensor(ex[1]) for ex in examples]
    inputs = pad_sequence(inputs, batch_first=True, padding_value=vocab["<pad>"])
    # Fix: pad targets with the *tag* vocabulary's pad id, not the token
    # vocabulary's — the two ids need not coincide, and a token-vocab pad id
    # may even lie outside the tag-label range.
    targets = pad_sequence(targets, batch_first=True, padding_value=tag_vocab["<pad>"])
    # Mask marks the real (non-pad) token positions of each sequence.
    return inputs, lengths, targets, inputs != vocab["<pad>"]
# Evaluate tagging accuracy on the test set: count correct predictions
# only at non-padding positions (selected via the boolean mask).
acc = 0
total = 0
for batch in tqdm(test_data_loader, desc=f"Testing"):
    inputs, lengths, targets, mask = [x.to(device) for x in batch]
    with torch.no_grad():  # inference only — no gradients needed
        output = model(inputs, lengths)
    # argmax over the last dim picks the predicted tag per token; the mask
    # restricts both the correct count and the total to real tokens.
    acc += (output.argmax(dim=-1) == targets)[mask].sum().item()
    total += mask.sum().item()
def load_sentence_polarity():
    """Download NLTK's sentence_polarity corpus and build id-encoded splits.

    NOTE(review): the definition appears to continue beyond this chunk — no
    return statement is visible here; presumably it returns ``train_data``,
    ``test_data`` and ``vocab``.  Confirm against the full file.
    """
    # NOTE(review): hard-coded LAN proxy for the NLTK download —
    # environment-specific; consider making it configurable.
    nltk.set_proxy('http://192.168.0.28:1080')
    nltk.download('sentence_polarity')
    from nltk.corpus import sentence_polarity

    # Vocabulary built over every sentence of the corpus.
    vocab = Vocab.build(sentence_polarity.sents())

    # Label 0 = positive, 1 = negative; the first 4000 sentences of each
    # class form the training split, the remainder the test split.
    train_data = [(vocab.convert_tokens_to_ids(sentence), 0) for sentence in sentence_polarity.sents(categories='pos')[:4000]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1) for sentence in sentence_polarity.sents(categories='neg')[:4000]]

    test_data = [(vocab.convert_tokens_to_ids(sentence), 0) for sentence in sentence_polarity.sents(categories='pos')[4000:]] \
        + [(vocab.convert_tokens_to_ids(sentence), 1) for sentence in sentence_polarity.sents(categories='neg')[4000:]]