import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertConfig
from transformers import AdamW, BertForSequenceClassification, get_linear_schedule_with_warmup
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
!nvidia-smi
df = pd.read_csv(
    r'dataset/cola_public/raw/in_domain_train.tsv',
    encoding='utf-8',
    delimiter='\t',
    header=None,
    names=['sentence_source', 'label', 'label_notes', 'sentence'],
)
df.shape
df.sample(10)
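# A quick sanity check (an optional addition, not part of the original flow):
# CoLA labels are binary (0 = unacceptable, 1 = acceptable), so the class
# balance is worth a look before training.
df['label'].value_counts()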
sentences = df['sentence'].values
# Note: encode(..., add_special_tokens=True) below already inserts [CLS] and
# [SEP], so wrapping the raw sentences in them manually would duplicate the
# special tokens.
labels = df['label'].values
sentences
tokenizer = BertTokenizer.from_pretrained(r'model/bert-base-chinese', do_lower_case=True)
tokenized_texts = [tokenizer.encode(sent, add_special_tokens=True) for sent in sentences]
print("Tokenize the first sentence:")
print(tokenized_texts[0])
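# Optional inspection (an assumption about intent, not in the original):
# map the ids back to tokens to confirm that encode() inserted [CLS]/[SEP].
print(tokenizer.convert_ids_to_tokens(tokenized_texts[0]))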
MAX_LEN = 128
# Pad short sequences with the tokenizer's pad id; truncate long ones to
# MAX_LEN (note: hard truncation drops the trailing [SEP] token).
input_ids = torch.tensor([
    tokens + [tokenizer.pad_token_id] * (MAX_LEN - len(tokens)) if len(tokens) < MAX_LEN else tokens[:MAX_LEN]
    for tokens in tokenized_texts
])
print("Padded input shape:", input_ids.shape)
print(input_ids)
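# An alternative sketch, assuming a recent transformers version: the tokenizer
# itself can pad, truncate, and build attention masks in one call. This is an
# equivalent shortcut, not the original code's approach; `encoded` is a name
# introduced here for illustration.
encoded = tokenizer(
    list(df['sentence'].values),
    add_special_tokens=True,
    max_length=MAX_LEN,
    padding='max_length',
    truncation=True,
    return_tensors='pt',
)
# encoded['input_ids'] and encoded['attention_mask'] correspond to the manual
# input_ids above and the attention_masks built below.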
attention_masks = []
for seq in input_ids:
    # 1.0 for real tokens, 0.0 for padding (BERT's pad token id is 0).
    seq_mask = [float(i > 0) for i in seq]
    attention_masks.append(seq_mask)
attention_masks
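# A hedged sketch of the next step, wiring up the so-far-unused imports:
# split into train/validation sets and wrap the tensors in DataLoaders.
# test_size, random_state, and batch_size here are illustrative assumptions,
# not values from the original.
masks = torch.tensor(attention_masks)
labels_t = torch.tensor(labels)
train_idx, val_idx = train_test_split(np.arange(len(labels_t)), test_size=0.1, random_state=42)

train_data = TensorDataset(input_ids[train_idx], masks[train_idx], labels_t[train_idx])
val_data = TensorDataset(input_ids[val_idx], masks[val_idx], labels_t[val_idx])

batch_size = 32
train_dataloader = DataLoader(train_data, sampler=RandomSampler(train_data), batch_size=batch_size)
validation_dataloader = DataLoader(val_data, sampler=SequentialSampler(val_data), batch_size=batch_size)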