1、摘要说明
1.1、声明
*本文使用自备数据训练并迭代自己的数据模型。不细讲理论,只着重实现;理论部分请阅读官方文档。
1.2、NLP
NLP旨在让计算机能够理解、分析和生成人类语言,即通过计算机对自然语言的形、音、义等信息进行处理,包括字、词、句、篇章的输入、输出、识别、分析、理解、生成等操作和加工。简单来说,NLP的目标是让计算机能够“理解”我们的语言。
1.3、BERT
BERT的核心思想是通过在大规模语料库上进行无监督的预训练,学习到丰富的语言表示,然后将这些预训练好的表示应用于各种NLP任务中,通常只需要在任务特定的数据上进行微调(fine-tuning)即可达到很好的效果。
BERT文档:
https://huggingface.co/docs/transformers/model_doc/bert
BERT-demo:
https://github.com/google-research/bert
2、分类文本数据
2.1、描述
此例为文本三层分类:利用大量数据中的描述信息,训练出对应各层级推理的模型。
例如:
第一层模型以描述和第一层结果进行训练。后续如果自己只需训练单层分类,参照第一层的做法即可。
第二层模型以描述以及第一、第二层的结果共同训练得到。
第三层模型以描述以及第一、第二、第三层的结果共同训练得到。
2.2、数据格式
此数据为模拟数据,仅展示格式概要。并将其命名为【分层结果.xlsx】
3、训练
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2024/7/9
# @Author : CCM
# @Describe : 文本三分类,层次递进嵌套
import pandas as pd
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, pipeline
import torch
import joblib
from sklearn.model_selection import train_test_split
class HierarchicalClassifier:
    """Three-level hierarchical text classifier built on BERT.

    One ``BertForSequenceClassification`` model is fine-tuned per level.
    Levels 2 and 3 restrict their training rows to rows whose ancestor-level
    labels were seen when fitting the earlier levels' encoders, so each model
    is trained on a slice consistent with the hierarchy.
    """

    def __init__(self, excel_file='分层结果.xlsx', model_dir='bert-base-uncased'):
        """
        :param excel_file: spreadsheet with columns 问题描述/第一层/第二层/第三层
        :param model_dir: HF hub id or local path for the tokenizer/base model
        """
        self.excel_file = excel_file
        self.model_dir = model_dir
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)
        self.classifiers = {}     # level (1..3) -> HF text-classification pipeline
        self.label_encoders = {}  # level (1..3) -> fitted LabelEncoder

    def read_excel(self):
        """Load the spreadsheet, keep the four required columns, drop NaN rows."""
        data = pd.read_excel(self.excel_file)
        data = data[['问题描述', '第一层', '第二层', '第三层']]
        return data.dropna()

    def create_model(self, num_labels):
        """Pretrained BERT encoder with a fresh classification head for *num_labels*."""
        return BertForSequenceClassification.from_pretrained(self.model_dir, num_labels=num_labels)

    def preprocess_function(self, examples, text_column):
        """Tokenize one batch: fixed 512-token, padded/truncated sequences."""
        return self.tokenizer(examples[text_column], truncation=True, padding='max_length', max_length=512)

    def train_and_save_model(self, train_dataset, test_dataset, label_encoder, output_dir):
        """Tokenize both splits, fine-tune (elided), and persist the label encoder."""
        encoded_train_dataset = train_dataset.map(lambda e: self.preprocess_function(e, '问题描述'), batched=True)
        encoded_test_dataset = test_dataset.map(lambda e: self.preprocess_function(e, '问题描述'), batched=True)
        # NOTE(review): the Trainer/TrainingArguments fine-tuning loop was
        # elided in the original post ("available on request").
        joblib.dump(label_encoder, f'{output_dir}/label_encoder.pkl')

    def _train_layer(self, data, level, label_col, feature_cols, output_dir):
        """Shared per-level routine: encode labels, split, train, register pipeline."""
        label_encoder = LabelEncoder()
        data = data.copy()  # work on a copy; avoid mutating the caller's frame
        data['label'] = label_encoder.fit_transform(data[label_col])
        # BUGFIX: keep the encoded 'label' column in the split so the
        # datasets actually carry the training targets (the original
        # selected only the raw text/label-name columns, dropping 'label').
        train_data, test_data = train_test_split(
            data[feature_cols + ['label']], test_size=0.2, random_state=42)
        self.train_and_save_model(
            Dataset.from_pandas(train_data), Dataset.from_pandas(test_data),
            label_encoder, output_dir)
        self.classifiers[level] = pipeline('text-classification', model=output_dir, tokenizer=output_dir)
        self.label_encoders[level] = label_encoder

    def train_first_layer(self, data):
        """Train level 1 from the description text alone."""
        self._train_layer(data, 1, '第一层', ['问题描述', '第一层'], './results_first_layer')

    def train_second_layer(self, data):
        """Train level 2 on rows whose level-1 label was seen by level 1."""
        data = data[data['第一层'].isin(self.label_encoders[1].classes_)]
        self._train_layer(data, 2, '第二层', ['问题描述', '第一层', '第二层'], './results_second_layer')

    def train_third_layer(self, data):
        """Train level 3 on rows whose level-1 and level-2 labels are known.

        BUGFIX: the original compared 第一层 against the *level-2* encoder's
        classes and 第二层 against the *level-1* encoder's classes (swapped),
        which would silently drop nearly every row.
        """
        data = data[
            (data['第一层'].isin(self.label_encoders[1].classes_)) &
            (data['第二层'].isin(self.label_encoders[2].classes_))
        ]
        self._train_layer(data, 3, '第三层', ['问题描述', '第一层', '第二层', '第三层'], './results_third_layer')

    def predict_hierarchy(self, text):
        """Run all three level pipelines on *text*; return {'第N层': class name}."""
        predicted_labels = {}
        for level in range(1, 4):
            result = self.classifiers[level](text)
            # Pipeline labels look like 'LABEL_<idx>'; decode idx back to the class name.
            idx = int(result[0]['label'].split('_')[1])
            predicted_labels[f'第{level}层'] = self.label_encoders[level].inverse_transform([idx])[0]
        return predicted_labels

    def train_all_layers(self):
        """Read the data once and train levels 1-3 in order (2 and 3 depend on 1)."""
        data = self.read_excel()
        self.train_first_layer(data)
        self.train_second_layer(data)
        self.train_third_layer(data)

    def test_inference(self):
        """Smoke-test inference on a fixed sample sentence."""
        text = "这是一部很棒的电影!"
        predicted_labels = self.predict_hierarchy(text)
        print(f"推理结果: {predicted_labels}")
# Usage example: train all three levels, then run a sample inference.
if __name__ == "__main__":
    hc = HierarchicalClassifier()
    hc.train_all_layers()
    hc.test_inference()
4、推理
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time : 2024/7/9
# @Author : CCM
# @Describe : 推理文本三分类模型
import time
import joblib
from transformers import pipeline
# Load the per-level LabelEncoders fitted at training time
# (needed to map 'LABEL_<idx>' pipeline outputs back to class names).
label_encoder_first = joblib.load('./models_ccm/label_encoder_first.pkl')
label_encoder_second = joblib.load('./models_ccm/label_encoder_second.pkl')
label_encoder_third = joblib.load('./models_ccm/label_encoder_third.pkl')
# Load each level's fine-tuned model + tokenizer as an HF text-classification pipeline.
classifier_first = pipeline(
    'text-classification',
    model="./models_ccm/custom_saved_model_first",
    tokenizer="./models_ccm/custom_saved_model_first"
)
classifier_second = pipeline(
    'text-classification',
    model="./models_ccm/custom_saved_model_second",
    tokenizer="./models_ccm/custom_saved_model_second"
)
classifier_third = pipeline(
    'text-classification',
    model="./models_ccm/custom_saved_model_third",
    tokenizer="./models_ccm/custom_saved_model_third"
)
# 推理函数
def predict_hierarchy(text):
# 第一层预测
result_first = classifier_first(text)
predicted_first = label_encoder_first.inverse_transform([int(result_first[0]['label'].split('_')[1])])[0]
# 第二层预测
result_second = classifier_second(text)
predicted_second = label_encoder_second.inverse_transform([int(result_second[0]['label'].split('_')[1])])[0]
# 第三层预测
result_third = classifier_third(text)
predicted_third = label_encoder_third.inverse_transform([int(result_third[0]['label'].split('_')[1])])[0]
return predicted_first, predicted_second, predicted_third
# 示例调用
if __name__ == "__main__":
t1 = time.time()
text = "营销活动问题要求相关业务,请尽快联系处理。"
first_layer, second_layer, third_layer = predict_hierarchy(text)
res_dict = {"第一层": first_layer, "第二层": second_layer, "第三层": third_layer, "时间": time.time() - t1}
print(res_dict)
5、总结
如果出现过拟合或欠拟合等情况,合理调整相关参数。参考信息如下:
训练

| batch_size | seq_len | Time per batch (eager - s) | Time per batch (sdpa - s) | Speedup (%) | Eager peak mem (MB) | sdpa peak mem (MB) | Mem saving (%) |
|---|---|---|---|---|---|---|---|
| 4 | 256 | 0.023 | 0.017 | 35.472 | 939.213 | 764.834 | 22.800 |
| 4 | 512 | 0.023 | 0.018 | 23.687 | 1970.447 | 1227.162 | 60.569 |
| 8 | 256 | 0.023 | 0.018 | 23.491 | 1594.295 | 1226.114 | 30.028 |
| 8 | 512 | 0.035 | 0.025 | 43.058 | 3629.401 | 2134.262 | 70.054 |
| 16 | 256 | 0.030 | 0.024 | 25.583 | 2874.426 | 2134.262 | 34.680 |
| 16 | 512 | 0.064 | 0.044 | 46.223 | 6964.659 | 3961.013 | 75.830 |

推理

| batch_size | seq_len | Per token latency eager (ms) | Per token latency SDPA (ms) | Speedup (%) | Mem eager (MB) | Mem BT (MB) | Mem saved (%) |
|---|---|---|---|---|---|---|---|
| 1 | 128 | 5.736 | 4.987 | 15.022 | 282.661 | 282.924 | -0.093 |
| 1 | 256 | 5.689 | 4.945 | 15.055 | 298.686 | 298.948 | -0.088 |
| 2 | 128 | 6.154 | 4.982 | 23.521 | 314.523 | 314.785 | -0.083 |
| 2 | 256 | 6.201 | 4.949 | 25.303 | 347.546 | 347.033 | 0.148 |
| 4 | 128 | 6.049 | 4.987 | 21.305 | 378.895 | 379.301 | -0.107 |
| 4 | 256 | 6.285 | 5.364 | 17.166 | 443.209 | 444.382 | -0.264 |