[Super Practical!] Natural Language Processing with PyTorch and Transformers! ✨

Hey everyone! 👋 Today I want to share a really cool project with you: using PyTorch and a Transformer to tackle a text classification task! If you're interested in natural language processing, or looking for a hands-on project to level up your skills, this is a tutorial you won't want to miss! 👩‍💻✨

📚 Project Background

Transformers have been one of the major breakthroughs in natural language processing in recent years: they handle long sequences effectively and deliver strong results across many tasks. In this project, we'll fine-tune a pretrained Transformer model (BERT) to run sentiment analysis on text!

💻 Environment Setup

First, make sure PyTorch and the other required libraries are installed. You can install them with pip:

pip install torch transformers scikit-learn numpy
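
If you want to double-check that the installation worked, a quick sanity check (the printed versions will depend on your environment) looks like this:

import torch
import transformers

print(torch.__version__)          # PyTorch version
print(transformers.__version__)   # transformers version
print(torch.cuda.is_available())  # True if a GPU is usable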

📝 Code Walkthrough

Now let's implement the project step by step! 👇

1. Import the Required Libraries

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np  # used to average the per-batch losses

2. Data Preprocessing

Here we use a tiny hard-coded text dataset so the example stays easy to follow; you can swap in your own data (see the sketch below).
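
If you do want to plug in your own data, one common pattern (just a sketch; the file name and column names below are made up for illustration) is to read a CSV with pandas and feed the resulting lists into the same pipeline:

import pandas as pd

# Hypothetical CSV with a "text" column and a "label" column (0 = negative, 1 = positive)
df = pd.read_csv("my_reviews.csv")
texts = df["text"].tolist()
labels = df["label"].tolist()

For the rest of this walkthrough, though, we'll stick with the hard-coded toy examples.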

# A few toy example texts and their labels
texts = ["I love this movie!", "This is terrible.", "Absolutely amazing!", "What a waste of time."]
labels = [1, 0, 1, 0]  # 1 = positive review, 0 = negative review

# Split into training and test sets (with only 4 toy examples this leaves a single test sample)
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the pretrained BERT tokenizer and sequence-classification model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Dataset class that tokenizes each text on the fly
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }

# Helper that wraps the dataset in a DataLoader
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = TextDataset(texts, labels, tokenizer, max_len)
    return DataLoader(ds, batch_size=batch_size, num_workers=2)

# Hyperparameters
MAX_LEN = 128
BATCH_SIZE = 8
train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)
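
To make sure the loaders produce what the model expects, an optional sanity check is to pull one batch and print the tensor shapes (this is not part of the training pipeline, just a quick inspection):

batch = next(iter(train_data_loader))
print(batch['input_ids'].shape)       # (batch, MAX_LEN)
print(batch['attention_mask'].shape)  # (batch, MAX_LEN)
print(batch['labels'].shape)          # (batch,)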

3. Training the Model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Loss function and optimizer
loss_fn = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Training loop for one epoch
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Evaluation loop
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            loss = loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)

# Train and evaluate
EPOCHS = 5
for epoch in range(EPOCHS):
    train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print(f'Train loss {train_loss} accuracy {train_acc}')

    test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device)
    print(f'Test loss {test_loss} accuracy {test_acc}')
    print('-' * 50)
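
Once training finishes, you can use the fine-tuned model to score new sentences. Below is a minimal inference sketch (the helper function and the example sentence are my own additions, not part of the training code above):

def predict_sentiment(text, model, tokenizer, device, max_len=MAX_LEN):
    model.eval()
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    with torch.no_grad():
        outputs = model(
            input_ids=encoding['input_ids'].to(device),
            attention_mask=encoding['attention_mask'].to(device),
        )
    probs = torch.softmax(outputs.logits, dim=1)
    return 'positive' if probs.argmax(dim=1).item() == 1 else 'negative'

print(predict_sentiment("The plot was gripping from start to finish.", model, tokenizer, device))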

The complete code is below:

import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import numpy as np

# A few toy example texts and their labels
texts = ["I love this movie!", "This is terrible.", "Absolutely amazing!", "What a waste of time."]
labels = [1, 0, 1, 0]  # 1 = positive review, 0 = negative review

# Split into training and test sets
train_texts, test_texts, train_labels, test_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)

# Load the pretrained BERT tokenizer and sequence-classification model
# (bert_model_path can also point to a local directory containing a downloaded checkpoint)
bert_model_path = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(bert_model_path)
model = AutoModelForSequenceClassification.from_pretrained(bert_model_path, num_labels=2)


# Dataset class that tokenizes each text on the fly
class TextDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        text = str(self.texts[item])
        label = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


# Helper that wraps the dataset in a DataLoader
def create_data_loader(texts, labels, tokenizer, max_len, batch_size):
    ds = TextDataset(texts, labels, tokenizer, max_len)
    return DataLoader(ds, batch_size=batch_size, num_workers=2)




# Training loop for one epoch
def train_epoch(model, data_loader, loss_fn, optimizer, device):
    model = model.train()

    losses = []
    correct_predictions = 0

    for d in data_loader:
        input_ids = d["input_ids"].to(device)
        attention_mask = d["attention_mask"].to(device)
        targets = d["labels"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        _, preds = torch.max(outputs.logits, dim=1)
        loss = loss_fn(outputs.logits, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        optimizer.zero_grad()

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


# Evaluation loop
def eval_model(model, data_loader, loss_fn, device):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        for d in data_loader:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["labels"].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            _, preds = torch.max(outputs.logits, dim=1)

            loss = loss_fn(outputs.logits, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / len(data_loader.dataset), np.mean(losses)


if __name__ == '__main__':
    # Hyperparameters
    MAX_LEN = 128
    BATCH_SIZE = 8
    train_data_loader = create_data_loader(train_texts, train_labels, tokenizer, MAX_LEN, BATCH_SIZE)
    test_data_loader = create_data_loader(test_texts, test_labels, tokenizer, MAX_LEN, BATCH_SIZE)

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)

    # Loss function and optimizer
    loss_fn = nn.CrossEntropyLoss().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

    # Train and evaluate
    EPOCHS = 5
    for epoch in range(EPOCHS):
        train_acc, train_loss = train_epoch(model, train_data_loader, loss_fn, optimizer, device)
        print(f'Epoch {epoch + 1}/{EPOCHS}')
        print(f'Train loss {train_loss} accuracy {train_acc}')

        test_acc, test_loss = eval_model(model, test_data_loader, loss_fn, device)
        print(f'Test loss {test_loss} accuracy {test_acc}')
        print('-' * 50)
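
If you want to reuse the fine-tuned model later without retraining, the standard Hugging Face save/load calls look like this (the directory name is just an example):

# Save the fine-tuned weights and tokenizer
model.save_pretrained('sentiment-bert')
tokenizer.save_pretrained('sentiment-bert')

# Reload them later the same way the original checkpoint was loaded:
# model = AutoModelForSequenceClassification.from_pretrained('sentiment-bert')
# tokenizer = AutoTokenizer.from_pretrained('sentiment-bert')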


🏆 Results

With a real dataset in place of the toy examples, this fine-tuned model can reach solid accuracy on text classification! It's not only a great beginner project, but also a good starting point for exploring natural language processing further. If you have any questions about the project, or want to share your own results, feel free to leave a comment! If you enjoyed this post, please like, follow, and save it; I'll keep bringing more AI-related content. 👩‍💻✨
Training results figure: (screenshot omitted)

#PyTorch #Transformer #NLP #SentimentAnalysis
