import os
import gc
import time
import copy
import random
import string
import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import AdamW, lr_scheduler  # torch's AdamW; the transformers version is deprecated
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GroupKFold, KFold
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import DataCollatorWithPadding
import warnings
warnings.filterwarnings("ignore")
train_dir = '/kaggle/input/feedback-prize-effectiveness/train'
test_dir = '/kaggle/input/feedback-prize-effectiveness/test'
model_path = '/kaggle/input/deberta-v3-base/deberta-v3-base'

config = {
    'seed': 2022,
    'epochs': 3,
    'model_name': 'microsoft/deberta-v3-base',
    'train_batch_size': 8,
    'valid_batch_size': 8,
    'max_length': 512,
    'learning_rate': 2e-5,
    'scheduler': 'CosineAnnealingLR',
    'min_lr': 1e-6,
    'T_max': 500,
    'weight_decay': 1e-6,
    'n_fold': 5,
    'n_accumulate': 1,
    'num_classes': 3,
    'device': torch.device('cuda:0' if torch.cuda.is_available() else 'cpu'),
    'tokenizer': AutoTokenizer.from_pretrained(model_path),
}
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(config['seed'])
def get_essay(essay_id):
    essay_path = os.path.join(train_dir, f'{essay_id}.txt')
    with open(essay_path, 'r') as f:  # close the file handle explicitly
        essay_text = f.read()
    return essay_text

df = pd.read_csv('/kaggle/input/feedback-prize-effectiveness/train.csv')
df['essay_text'] = df['essay_id'].apply(get_essay)
df.head()
# Split by essay_id so that all discourse elements from the same essay land in
# the same fold, preventing leakage between train and validation sets.
gkf = GroupKFold(n_splits=config['n_fold'])
for fold, (_, val_) in enumerate(gkf.split(X=df, groups=df.essay_id)):
    df.loc[val_, 'kfold'] = int(fold)
df['kfold'] = df['kfold'].astype(int)
df.groupby('kfold')['discourse_effectiveness'].value_counts()
encoder = LabelEncoder()
df['discourse_effectiveness'] = encoder.fit_transform(df['discourse_effectiveness'])
with open('le.pkl', 'wb') as fp:
    joblib.dump(encoder, fp)
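# The encoder is saved so integer predictions can be mapped back to label
# names at inference time. A minimal round-trip sketch (the `preds` array
# below is hypothetical model output):
loaded_encoder = joblib.load('le.pkl')
preds = np.array([0, 2, 1])  # hypothetical argmax outputs
print(loaded_encoder.inverse_transform(preds))  # e.g. ['Adequate' 'Ineffective' 'Effective']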
class FeedBackDataset(Dataset):
    def __init__(self, df, tokenizer, max_length):
        self.df = df
        self.max_len = max_length
        self.tokenizer = tokenizer
        self.discourse = df['discourse_text'].values
        self.discourse_type = df['discourse_type'].values
        self.essay = df['essay_text'].values
        self.targets = df['discourse_effectiveness'].values

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        discourse = self.discourse[index]
        discourse_type = self.discourse_type[index]
        essay = self.essay[index]
        # Input format: <type> [SEP] <discourse text> [SEP] <full essay>
        text = discourse_type + self.tokenizer.sep_token + discourse + self.tokenizer.sep_token + " " + essay
        # No static padding here: the DataCollatorWithPadding below pads each
        # batch dynamically to its longest sequence.
        inputs = self.tokenizer.encode_plus(
            text,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
        )
        return {
            'input_ids': inputs['input_ids'],
            'attention_mask': inputs['attention_mask'],
            'target': self.targets[index],
        }
collate_fn = DataCollatorWithPadding(tokenizer=config['tokenizer'])
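# Quick sanity check of the dynamic padding: the collator pads each batch to
# its longest member rather than to max_length, which saves compute.
tok = config['tokenizer']
features = [tok("short text"), tok("a noticeably longer piece of text for comparison")]
print(collate_fn(features)['input_ids'].shape)  # (2, length of the longer sequence)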
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()

    def forward(self, last_hidden_state, attention_mask):
        # Expand the mask to the hidden dimension so padded positions contribute zero.
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)  # avoid division by zero on all-padding rows
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
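# Toy check of the masked mean: the third token is padding, so the pooled
# vector should be the mean of the first two embeddings only.
hidden = torch.tensor([[[1.0, 1.0], [3.0, 3.0], [100.0, 100.0]]])  # (batch=1, seq=3, dim=2)
mask = torch.tensor([[1, 1, 0]])
print(MeanPooling()(hidden, mask))  # tensor([[2., 2.]])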
class FeedBackModel(nn.Module):
    def __init__(self, model_name):
        super(FeedBackModel, self).__init__()
        self.model = AutoModel.from_pretrained(model_name)
        self.config = AutoConfig.from_pretrained(model_name)
        self.drop = nn.Dropout(p=0.2)
        self.pooler = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, config['num_classes'])

    def forward(self, ids, mask):
        out = self.model(input_ids=ids, attention_mask=mask, output_hidden_states=False)
        out = self.pooler(out.last_hidden_state, mask)
        out = self.drop(out)
        outputs = self.fc(out)
        return outputs

FeedBackModel(model_path)  # sanity check: backbone and head build without errors
def criterion(outputs, labels):
    return nn.CrossEntropyLoss()(outputs, labels)
def train_one_epoch(model, optimizer, scheduler, dataloader, device, epoch):
    model.train()
    data_size = 0
    running_loss = 0.0
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        loss = loss / config['n_accumulate']  # scale the loss for gradient accumulation
        loss.backward()
        if (step + 1) % config['n_accumulate'] == 0:
            optimizer.step()
            optimizer.zero_grad()
            if scheduler is not None:
                scheduler.step()
        running_loss += (loss.item() * batch_size)
        data_size += batch_size
        epoch_loss = running_loss / data_size  # running average shown in the progress bar
        bar.set_postfix(Epoch=epoch, Train_Loss=epoch_loss, LR=optimizer.param_groups[0]['lr'])
    gc.collect()
    return epoch_loss
@torch.no_grad()
def valid_one_epoch(model, dataloader, device, epoch):
    model.eval()
    dataset_size = 0
    running_loss = 0.0
    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        targets = data['target'].to(device, dtype=torch.long)
        batch_size = ids.size(0)
        outputs = model(ids, mask)
        loss = criterion(outputs, targets)
        running_loss += (loss.item() * batch_size)
        dataset_size += batch_size
        epoch_loss = running_loss / dataset_size
        bar.set_postfix(Epoch=epoch, Valid_Loss=epoch_loss)
    gc.collect()
    return epoch_loss
def run_training(model, optimizer, scheduler, device, num_epochs, fold):
    # train_loader and valid_loader are the module-level loaders prepared per fold below.
    if torch.cuda.is_available():
        print('[INFO] Using GPU: {}\n'.format(torch.cuda.get_device_name()))
    start = time.time()
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch_loss = np.inf
    history = defaultdict(list)
    for epoch in range(1, num_epochs + 1):
        gc.collect()
        train_epoch_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device, epoch)
        val_epoch_loss = valid_one_epoch(model, valid_loader, device, epoch)
        history['Train Loss'].append(train_epoch_loss)
        history['Valid Loss'].append(val_epoch_loss)
        if val_epoch_loss <= best_epoch_loss:
            print(f"Validation Loss Improved: {best_epoch_loss} ---> {val_epoch_loss}")
            best_epoch_loss = val_epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
            PATH = f'Loss-Fold-{fold}.bin'
            torch.save(model.state_dict(), PATH)
            print('Model Saved')
        print()
    end = time.time()
    time_elapsed = end - start
    minutes, seconds = divmod(time_elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    print(f"Time elapsed: {int(hours)} hours, {int(minutes)} minutes, {seconds:.2f} seconds")
    model.load_state_dict(best_model_wts)
    return model, history
def prepare_loaders(fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)
    train_dataset = FeedBackDataset(df_train, tokenizer=config['tokenizer'], max_length=config['max_length'])
    valid_dataset = FeedBackDataset(df_valid, tokenizer=config['tokenizer'], max_length=config['max_length'])
    train_loader = DataLoader(train_dataset, batch_size=config['train_batch_size'], collate_fn=collate_fn,
                              num_workers=8, shuffle=True, pin_memory=False, drop_last=True)
    # Validation: no shuffling, keep every sample, and use the validation batch size.
    valid_loader = DataLoader(valid_dataset, batch_size=config['valid_batch_size'], collate_fn=collate_fn,
                              num_workers=8, shuffle=False, pin_memory=False, drop_last=False)
    return train_loader, valid_loader
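# One-batch sanity check (a sketch, not part of the training loop): pull a
# single batch from fold 0 and confirm tensor shapes before training.
tl, vl = prepare_loaders(fold=0)
batch = next(iter(tl))
print(batch['input_ids'].shape)       # (train_batch_size, longest sequence in batch)
print(batch['attention_mask'].shape)  # same shape as input_ids
print(batch['target'].shape)          # (train_batch_size,)
del tl, vl, batch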
def fetch_scheduler(optimizer):
    if config['scheduler'] == 'CosineAnnealingLR':
        scheduler = lr_scheduler.CosineAnnealingLR(optimizer, T_max=config['T_max'], eta_min=config['min_lr'])
    elif config['scheduler'] == 'CosineAnnealingWarmRestarts':
        # Note: this branch expects a 'T_0' entry in config, which is not set above.
        scheduler = lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=config['T_0'], eta_min=config['min_lr'])
    elif config['scheduler'] is None:
        return None
    return scheduler
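# Standalone sketch of the configured schedule: with CosineAnnealingLR the
# learning rate decays from learning_rate to min_lr over T_max optimizer steps
# along a cosine curve (dummy one-parameter model, config values as above).
dummy = nn.Linear(1, 1)
opt = AdamW(dummy.parameters(), lr=config['learning_rate'])
sched = lr_scheduler.CosineAnnealingLR(opt, T_max=config['T_max'], eta_min=config['min_lr'])
for step in range(config['T_max'] + 1):
    if step % 250 == 0:
        print(step, opt.param_groups[0]['lr'])  # 2e-5 at step 0, ~1e-6 at step 500
    opt.step()
    sched.step()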
for fold in range(config['n_fold']):
    print(f'====== Fold: {fold} ======')
    train_loader, valid_loader = prepare_loaders(fold=fold)
    model = FeedBackModel(model_path)
    model.to(config['device'])
    optimizer = AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])
    scheduler = fetch_scheduler(optimizer)
    model, history = run_training(model, optimizer, scheduler, device=config['device'],
                                  num_epochs=config['epochs'], fold=fold)
    del model, history, train_loader, valid_loader
    _ = gc.collect()  # free memory between folds
    print()
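# Inference sketch (not in the original notebook): average the softmax
# probabilities of the five saved fold checkpoints. `test_loader` is a
# hypothetical loader built from the test set the same way as above, with a
# dummy 'discourse_effectiveness' column so FeedBackDataset can be reused.
@torch.no_grad()
def predict(model, dataloader, device):
    model.eval()
    probs = []
    for data in dataloader:
        ids = data['input_ids'].to(device, dtype=torch.long)
        mask = data['attention_mask'].to(device, dtype=torch.long)
        probs.append(torch.softmax(model(ids, mask), dim=1).cpu().numpy())
    return np.concatenate(probs)

fold_probs = []
for fold in range(config['n_fold']):
    model = FeedBackModel(model_path)
    model.load_state_dict(torch.load(f'Loss-Fold-{fold}.bin'))
    model.to(config['device'])
    fold_probs.append(predict(model, test_loader, config['device']))
final_probs = np.mean(fold_probs, axis=0)  # simple 5-fold average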
This was my baseline for my first Kaggle competition.