1. The ChnSentiCorp_htl_all.csv dataset
Dataset source: https://github.com/SophonPlus/ChineseNlpCorpus
A Chinese hotel-review dataset used mainly for sentiment analysis and text classification. It contains over 7,000 hotel reviews, of which roughly 5,000 are positive and 2,000 are negative.
Dataset structure — the file has two columns (see the loading sketch at the end of this section):
- label: sentiment label; 1 marks a positive review, 0 a negative one.
- review: the review text.
Dataset characteristics:
- Label coverage: both classes are well represented, although positive reviews outnumber negative ones by roughly 5:2, which is worth keeping in mind when reading accuracy numbers.
- Diversity: the reviews touch on many aspects of hotel service, reflecting a wide range of customer opinions and experiences.
- Representativeness: systematic collection and labeling keep the data quality and representativeness high.
Typical applications:
- Sentiment analysis: train and validate machine-learning or deep-learning models that detect the sentiment expressed in a text.
- Text classification: a standard NLP classification task in which a model learns to separate texts by sentiment.
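A minimal sketch of loading the CSV with pandas and checking the label distribution; the local path is an assumption and should point at wherever the file was downloaded:
import pandas as pd

df = pd.read_csv(r"D:\bigmodel_data\ChnSentiCorp_htl_all.csv")  # hypothetical local path
df = df.dropna()                   # a few rows have an empty review
print(df.columns.tolist())         # ['label', 'review']
print(df["label"].value_counts())  # roughly 5,000 positive (1) vs 2,000 negative (0)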
2. The hfl/rbt3 model
RBT3 (RoBERTa-wwm-ext with three layers) is a pre-trained model designed specifically for Chinese natural language processing:
it aims to make Chinese NLP more efficient, and its whole-word handling in particular improves accuracy on fill-mask tasks and general language understanding.
Model characteristics:
- Whole word masking: during pre-training RBT3 masks all characters of a word at once rather than individual characters. This helps the model learn the complete meaning of words and improves its fill-mask performance.
- Three-layer architecture: RBT3 consists of three bidirectional Transformer encoder layers, far fewer than a standard 12-layer BERT-base, so it has fewer parameters, is lighter, and runs faster.
- Open source with rich resources: RBT3 is built on open-source work, supports the fill-mask task (a quick example follows), and comes with tooling for follow-up research; for example, the TextBrewer toolkit can be used for knowledge distillation with this model, further extending its range of applications.
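A quick way to sanity-check the model's masked-language-modeling head is the fill-mask pipeline; a minimal sketch, assuming the hfl/rbt3 weights can be fetched from the Hugging Face Hub (or substitute a local path such as the D:\bigmodel_data\rbt3 copy used below):
from transformers import pipeline

fill = pipeline("fill-mask", model="hfl/rbt3")
# The model predicts the [MASK]ed character; print the top 3 candidates.
for cand in fill("今天天气真不[MASK]。")[:3]:
    print(cand["token_str"], round(cand["score"], 3))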
3. Text classification (plain PyTorch training loop)
#--------------------
# 1. Load the data
#--------------------
import pandas as pd
data = pd.read_csv(r"D:\bigmodel_data\ChnSentiCorp_htl_all.csv")  # local copy of the dataset
# Data cleaning: drop rows with missing values
data = data.dropna()
#--------------------
# 2. Build a Dataset
#--------------------
from torch.utils.data import Dataset
class MyDataset(Dataset):
    def __init__(self):
        super().__init__()
        # Load the data with pandas
        self.data = pd.read_csv(r"D:\bigmodel_data\ChnSentiCorp_htl_all.csv")
        self.data = self.data.dropna()

    def __getitem__(self, index):
        # iloc selects rows/columns by integer position
        return self.data.iloc[index]['review'], self.data.iloc[index]['label']

    def __len__(self):
        return len(self.data)

dataset = MyDataset()
# dataset[0] # ('距离川沙公路较近,但是公交指示不对,如果是"蔡陆线"的话,会非常麻烦.建议用别的路线.房间较为简单.', 1)
#--------------------
# 3. Split the dataset
#--------------------
from torch.utils.data import random_split
train_data, valid_data = random_split(dataset=dataset,lengths=[0.8, 0.2])
# len(train_data),len(valid_data)
#--------------------
# 4. Build DataLoaders
#--------------------
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(r"D:\bigmodel_data\rbt3")
def collate_func(batch):
    texts, labels = [], []
    for item in batch:
        texts.append(item[0])
        labels.append(item[1])
    inputs = tokenizer(text=texts, max_length=128, padding="max_length", truncation=True, return_tensors="pt")
    inputs["labels"] = torch.tensor(labels)
    return inputs
train_loader = DataLoader(dataset=train_data, batch_size=32, shuffle=True, collate_fn=collate_func)
valid_loader = DataLoader(dataset=valid_data, batch_size=64, shuffle=False, collate_fn=collate_func)
# next(enumerate(valid_loader))[1]
#--------------------
# 5. Create the model and optimizer
#--------------------
from transformers import AutoModelForSequenceClassification
from torch.optim import Adam
model = AutoModelForSequenceClassification.from_pretrained(r"D:\bigmodel_data\rbt3")
if torch.cuda.is_available():
    model = model.cuda()
optimizer = Adam(model.parameters(), lr=2e-5)
#--------------------
# 6. Training and validation
#--------------------
def evaluate():
    model.eval()
    acc_sum = 0
    with torch.inference_mode():
        for batch in valid_loader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            output = model(**batch)
            pred = torch.argmax(output.logits, dim=-1)
            acc_sum += (pred.long() == batch["labels"].long()).float().sum()
    # Divide by the number of validation samples, not the number of batches
    return acc_sum / len(valid_data)

def train(epochs=3):
    for epoch in range(epochs):
        train_sum_loss = 0
        model.train()
        for batch in train_loader:
            if torch.cuda.is_available():
                batch = {k: v.cuda() for k, v in batch.items()}
            optimizer.zero_grad()
            output = model(**batch)
            output.loss.backward()
            optimizer.step()
            train_sum_loss += output.loss.item()
        acc = evaluate()
        train_avg_loss = train_sum_loss / len(train_loader)
        print(f"ep: {epoch}, acc: {acc}, loss: {train_avg_loss}")

train()
#--------------------
# 7. Inference
#--------------------
sen = ["这家酒店环境不错!饭很好吃。", "这家酒店环境不错,但是太贵了,我不会再来了"]
id2label = {0: "Negative", 1: "Positive"}
# Reuse the model and tokenizer fine-tuned above; re-loading them from the original
# pretrained path here would reset the classification head to random weights.
model.eval()
# Like torch.no_grad(), inference_mode runs the forward pass without tracking gradients
with torch.inference_mode():
    inputs = tokenizer(sen, max_length=128, padding=True, truncation=True, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = {k: v.cuda() for k, v in inputs.items()}
    logits = model(**inputs).logits
    pred = torch.argmax(logits, dim=-1)
    for i, p in enumerate(pred):
        print(f"Input: {sen[i]}\nPrediction: {id2label.get(p.item())}")
4. Text classification + datasets (full-parameter fine-tuning)
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
data_path = r"D:\bigmodel_data\ChnSentiCorp_htl_all.csv"
model_path = r"D:\bigmodel_data\rbt3"
# Load the data
data = load_dataset("csv",data_files=data_path,split="all")
# Data cleaning: drop rows with a missing review
data = data.filter(lambda x:x["review"] is not None)
# Split into train/test
data = data.train_test_split(test_size=0.2)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Preprocessing function: tokenize and truncate the reviews
def process_fun(example):
    tokenized_example = tokenizer(example['review'], max_length=128, truncation=True)
    tokenized_example['label'] = example['label']
    return tokenized_example
# Map over the dataset and drop the original columns
tokenized_data = data.map(
    process_fun,
    batched=True,
    remove_columns=data['train'].column_names
)
# DataLoader plus DataCollatorWithPadding for dynamic padding
train_data,test_data = tokenized_data["train"],tokenized_data["test"]
train_loader = DataLoader(train_data,batch_size=64,shuffle=True,collate_fn=DataCollatorWithPadding(tokenizer,padding=True))
test_loader = DataLoader(test_data,batch_size=64,shuffle=False,collate_fn=DataCollatorWithPadding(tokenizer,padding=True))
# next(enumerate(test_loader))[1]
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)
if torch.cuda.is_available():
    model = model.cuda()
# Full-parameter fine-tuning: every layer is updated
for param in model.parameters():
    param.requires_grad = True
# Partial fine-tuning: update only the model head (the classifier layers)
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False
# Optimizer
optimizer = torch.optim.Adam(model.parameters(),lr=2e-5)
# Evaluation function: count correct predictions on the test split
def evaluate():
    model.eval()
    acc_num = 0
    with torch.inference_mode():
        for batch in test_loader:
            inputs = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1)
            labels = batch["labels"].cuda()
            acc_num += (labels.long() == pred.long()).sum()
    return acc_num
# Training function
def train(epochs=3):
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:  # train on the training split, not the test split
            inputs = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        acc = evaluate()
        print(f"epoch: {epoch+1}, train loss: {loss:.4f}, correct on test split: {acc}")

train()
# Quick inference check
texts = ["这家酒店环境不错,但卫生做的不太到位,希望进一步完善!",
         "环境一般",
         "很满意的一次入住"]
id2label = {0: "Negative", 1: "Positive"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(text=texts, max_length=128, padding=True, truncation=True, return_tensors="pt")
    # Move the inputs to the GPU
    inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=-1)
    for i, p in enumerate(pred):
        label = id2label.get(p.item())
        print(f"Input: {texts[i]}, prediction: {label}")
5. Text classification + datasets + evaluate
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification,DataCollatorWithPadding
from datasets import load_dataset
from torch.utils.data import DataLoader
import evaluate
data_path = r"D:\bigmodel_data\ratings_2_0_1_test.csv"
model_path = r"D:\bigmodel_data\rbt3"
# Load the data
data = load_dataset("csv",data_files=data_path,split="all")
# Data cleaning: drop rows with a missing comment
data = data.filter(lambda x:x["comment"] is not None)
# Split into train/test
data = data.train_test_split(test_size=0.2)
# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Preprocessing function: tokenize, truncate, and turn ratings into binary labels
def process_fun(example):
    tokenized_example = tokenizer(example['comment'], max_length=128, truncation=True)
    # A rating of 3 or below counts as negative (0), above 3 as positive (1)
    tokenized_example['labels'] = [0 if e <= 3 else 1 for e in example['rating']]
    return tokenized_example
# Map over the dataset and drop the original columns
tokenized_data = data.map(
    process_fun,
    batched=True,
    remove_columns=data['train'].column_names
)
# DataLoader plus DataCollatorWithPadding for dynamic padding
train_data,test_data = tokenized_data["train"],tokenized_data["test"]
train_loader = DataLoader(train_data,batch_size=64,shuffle=True,collate_fn=DataCollatorWithPadding(tokenizer,padding=True))
test_loader = DataLoader(test_data,batch_size=64,shuffle=False,collate_fn=DataCollatorWithPadding(tokenizer,padding=True))
# next(enumerate(test_loader))[1]
# Load the model
model = AutoModelForSequenceClassification.from_pretrained(model_path)
if torch.cuda.is_available():
    model = model.cuda()
# Full-parameter fine-tuning: every layer is updated
for param in model.parameters():
    param.requires_grad = True
# Partial fine-tuning: update only the model head (the classifier layers)
# for name, param in model.named_parameters():
#     if "classifier" not in name:
#         param.requires_grad = False
# Optimizer
optimizer = torch.optim.Adam(model.parameters(),lr=2e-5)
clsf_metrics = evaluate.combine(["accuracy", "precision", "recall", "f1"])
# Evaluation function
# (this shadows the imported `evaluate` module; clsf_metrics was already created above, so it still works)
def evaluate():
    model.eval()
    with torch.inference_mode():
        for batch in test_loader:
            inputs = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**inputs)
            pred = torch.argmax(outputs.logits, dim=-1)
            labels = batch["labels"].cuda()
            clsf_metrics.add_batch(
                references=labels.long(),
                predictions=pred.long()
            )
    return clsf_metrics.compute()
# Training function
def train(epochs=3):
    global_step = 0
    for epoch in range(epochs):
        model.train()
        for batch in train_loader:  # train on the training split, not the test split
            inputs = {k: v.cuda() for k, v in batch.items()}
            outputs = model(**inputs)
            loss = outputs.loss
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            global_step += 1
            if global_step % 100 == 0:
                print(f"epoch: {epoch+1}, loss: {loss:.4f}")
        metrics = evaluate()
        print(f"epoch: {epoch+1}, train loss: {loss:.4f}, metrics: {metrics}")

train()
# Quick inference check
texts = ["这家酒店环境不错,但卫生做的不太到位,希望进一步完善!",
         "环境一般",
         "很满意的一次入住"]
id2label = {0: "Negative", 1: "Positive"}
model.eval()
with torch.inference_mode():
    inputs = tokenizer(text=texts, max_length=128, padding=True, truncation=True, return_tensors="pt")
    # Move the inputs to the GPU
    inputs = {k: v.cuda() for k, v in inputs.items()}
    outputs = model(**inputs)
    pred = torch.argmax(outputs.logits, dim=-1)
    for i, p in enumerate(pred):
        label = id2label.get(p.item())
        print(f"Input: {texts[i]}, prediction: {label}")
6. ⭐ Text classification + datasets + rewritten evaluate + Trainer + TrainingArguments
Rewrite the evaluation function as a compute_metrics callback and train with the Trainer class and TrainingArguments.
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from torch.optim import Adam
import torch
import evaluate
# Load the dataset
data_path = r"D:\bigmodel_data\ratings_2_0_1_test.csv"
dataset = load_dataset("csv", data_files=data_path, split='all')
dataset = dataset.filter(lambda x: x['comment'] is not None)
train_data = dataset.train_test_split(test_size=0.2)
# Load the tokenizer
model_path = r"D:\bigmodel_data\rbt3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Preprocessing function: tokenize, truncate, and turn ratings into binary labels
def process_fun(examples):
    tokenized_examples = tokenizer(examples['comment'], max_length=128, truncation=True)
    tokenized_examples['labels'] = [0 if e <= 3 else 1 for e in examples['rating']]
    return tokenized_examples
# Tokenize the dataset and drop the original columns
tokenized_dataset = train_data.map(
    process_fun,
    batched=True,
    remove_columns=train_data['train'].column_names)
# Load the model and an optimizer
model = AutoModelForSequenceClassification.from_pretrained(model_path).cuda()
# Note: the Trainer below builds its own optimizer from TrainingArguments;
# to use this one instead, pass optimizers=(opt, None) when creating the Trainer.
opt = Adam(model.parameters(), lr=1e-5)
# Define the evaluation metrics
clf_metrics = evaluate.combine(["accuracy", "f1", "recall", "precision"])
# Rewritten evaluate function, used as the Trainer's compute_metrics callback
# (it shadows the imported `evaluate` module, which has already been used above)
def evaluate(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    references = torch.tensor(labels)
    clf_metrics.add_batch(
        references=references.long(),
        predictions=predictions.long())
    return clf_metrics.compute()
# Training arguments
from transformers import TrainingArguments
train_args = TrainingArguments(
    output_dir="checkpoint",            # where checkpoints and training logs are written
    per_device_eval_batch_size=128,     # per-device (e.g. per-GPU) batch size for evaluation
    per_device_train_batch_size=64,     # per-device batch size for training
    logging_steps=100,                  # log every 100 steps
    evaluation_strategy='epoch',        # evaluate at the end of every epoch (can also be 'steps')
    save_strategy='epoch',              # save a checkpoint at the end of every epoch
    num_train_epochs=4,                 # number of training epochs
    save_total_limit=2,                 # keep at most this many checkpoints
    learning_rate=1e-5,                 # learning rate
    weight_decay=0.01,                  # weight decay (L2 regularization) to reduce overfitting
    metric_for_best_model='f1',         # metric used to select the best model
    load_best_model_at_end=True)        # reload the best checkpoint when training finishes
# Create the Trainer
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=evaluate  # metrics callback
)
# Train the model
trainer.train()
# Evaluate the model
trainer.evaluate(tokenized_dataset["test"])
# Inference
from transformers import pipeline
texts = ["这本书真不错!", "这本书太枯燥了!", "这书的故事情节还行。"]
id2label = {0: "Negative", 1: "Positive"}
model.eval()
model.config.id2label = id2label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
for text in texts:
    print(text)
    print(pipe(text))
7. Loading the trained model
Because the Trainer above was not given the tokenizer, the saved checkpoints do not include the tokenizer files; copy the original model's vocabulary and tokenizer configuration files into the checkpoint folder before loading from it (one way to do this is sketched below).
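A simple way to do the copying is to load the original tokenizer and save it straight into the checkpoint directory; a sketch, reusing the checkpoint path from the run above (checkpoint-792), which will differ on another machine:
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained(r"D:\bigmodel_data\rbt3")  # tokenizer used during training
tok.save_pretrained("checkpoint/checkpoint-792")               # writes vocab.txt, tokenizer_config.json, etc.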
# Inference
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained("checkpoint/checkpoint-792")
tokenizer = AutoTokenizer.from_pretrained("checkpoint/checkpoint-792")
texts = ["这本书真不错!", "这本书太枯燥了!", "这书的故事情节还行。"]
id2label = {0: "Negative", 1: "Positive"}
model.eval()
model.config.id2label = id2label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
for text in texts:
    print(text)
    print(pipe(text))
8. Text classification on a new dataset (a multi-class problem)
The key change from the binary setup is loading the classification head with num_labels set to the number of classes:
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=5).cuda()
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from datasets import load_dataset
from torch.utils.data import DataLoader
from transformers import DataCollatorWithPadding
from torch.optim import Adam
import torch
import evaluate
# Load the dataset
data_path = r"D:\bigmodel_data\data.json"
dataset = load_dataset("json", data_files=data_path, split='all')
dataset = dataset.filter(lambda x: x['text'] is not None)
train_data = dataset.train_test_split(test_size=0.2)
# Load the tokenizer
model_path = r"D:\bigmodel_data\rbt3"
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Preprocessing function: tokenize the text and map string labels to ids
label2id = {"diabetes": 0, "hypertension": 1, "hepatitis": 2, "aids": 3, "breast_cancer": 4}
def process_fun(examples):
    tokenized_examples = tokenizer(examples['text'], max_length=128, truncation=True)
    tokenized_examples['labels'] = [label2id[label] for label in examples['label']]
    return tokenized_examples
# Tokenize the dataset and drop the original columns
tokenized_dataset = train_data.map(
    process_fun,
    batched=True,
    remove_columns=train_data['train'].column_names)
# Grab the train and test splits
trainset, validset = tokenized_dataset['train'], tokenized_dataset['test']
# Load the model with a 5-way classification head, plus an optimizer
model = AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=5).cuda()
opt = Adam(model.parameters(), lr=1e-5)
# Evaluation metrics (average='macro' is specified below, suitable for multi-class)
accuracy_metric = evaluate.load(r"D:\bigmodel_code\metrics\accuracy.py")
f1_metric = evaluate.load(r"D:\bigmodel_code\metrics\f1.py")
recall_metric = evaluate.load(r"D:\bigmodel_code\metrics\recall.py")
precision_metric = evaluate.load(r"D:\bigmodel_code\metrics\precision.py")
# compute_metrics callback for the Trainer
def evaluate_model(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), dim=-1)
    references = torch.tensor(labels)
    # Compute each metric
    accuracy = accuracy_metric.compute(predictions=predictions, references=references)
    f1 = f1_metric.compute(predictions=predictions, references=references, average="macro")
    recall = recall_metric.compute(predictions=predictions, references=references, average="macro")
    precision = precision_metric.compute(predictions=predictions, references=references, average="macro")
    # Return all metrics in one dict
    return {"accuracy": accuracy["accuracy"],
            "f1": f1["f1"],
            "recall": recall["recall"],
            "precision": precision["precision"]}
# Training arguments
from transformers import TrainingArguments
train_args = TrainingArguments(
    output_dir="checkpoint",
    per_device_eval_batch_size=128,
    per_device_train_batch_size=64,
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    num_train_epochs=10,          # number of training epochs
    save_total_limit=3,           # keep at most this many checkpoints
    learning_rate=1e-5,
    weight_decay=0.01,            # weight decay
    metric_for_best_model='f1',
    load_best_model_at_end=True,
    greater_is_better=True)
# Create the Trainer
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['test'],
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    compute_metrics=evaluate_model
)
# Train the model
trainer.train()
# Evaluate the model
# trainer.evaluate(tokenized_dataset["test"])
# Inference
from transformers import pipeline
texts = ["血糖控制方法有哪些?", "如何降低血压?", "肝炎有哪些传播途径?", "如何预防艾滋病?", "乳腺癌早期症状是什么?",
         "如何保持血糖水平稳定?", "日常生活中如何有效控制血压?", "常见病毒性肝病有哪些预防措施?", "如何降低免疫系统受损的风险?", "女性如何进行乳腺自我检查?"]
# id2label = {0: "diabetes", 1: "hypertension", 2: "hepatitis", 3: "aids", 4: "breast_cancer"}
id2label = {0: "糖尿病", 1: "高血压", 2: "肝炎", 3: "艾滋病", 4: "乳腺癌"}
model.eval()
model.config.id2label = id2label
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer, device=0)
for text in texts:
    print(f"{text} -----> {pipe(text)}")
    # print(pipe(text))