A few words up front: I have rarely done NLP competitions before, so I don't really know today's large language models, but I was lucky to be carried by a strong teammate. We finished with 92.00 points, ranking second.
Competition link
Simply put, this is a Chinese text-classification task. It is not especially hard in itself, but because of labeling noise and class imbalance, reaching high accuracy is still not easy.
My code only adds some improvements to the training procedure on top of the baseline; the final solution is still the large-model approach built by my teammate. I'm writing this up in the spirit of learning from the experience.
A small ad: I currently have no fixed competition teammates and have been competing alone. I'd like to find a few people to do data competitions together (CV, NLP, or data analysis all work) and learn together. If you are short of teammates, feel free to DM me (I'm a first-year master's student in computer science)!
The source code is all open-sourced; take a look if you need it. The pre-trained weight files can be downloaded from huggingface.
Large-model approach
BERT approach
My idea: pick the classic BERT model, merge train + val, and since the classes are imbalanced, train in two stages; then run multi-fold cross-validation (the code below uses 7 splits) and discard the folds with noticeably lower accuracy. That is the simple recipe behind my code.
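To make the two-stage idea concrete, here is a minimal sketch of the inference logic. It assumes `stage_one_model` (trained on data where classes 0/4/5 are merged into class 0) and `stage_two_model` (trained only on the original 0/4/5 samples) are already trained; these names and the `predict_label` helper are purely illustrative, not the actual training code below.
# Illustrative two-stage inference logic (placeholder names, not the real pipeline code)
def predict_label(model, text):
    # placeholder: tokenize the text, run the model, take the argmax of the logits
    raise NotImplementedError

def two_stage_predict(text):
    coarse = predict_label(stage_one_model, text)   # stage one: classes {0 (= 0/4/5 merged), 1, 2, 3}
    if coarse != 0:
        return coarse                               # 1/2/3 are final after stage one
    return predict_label(stage_two_model, text)     # stage two splits the merged class into 0/4/5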
Data processing
train + val together give 7,000 samples and the test set has 1,000; the metric is accuracy, so our team's score of 92 means roughly 1000 × (1 − 0.92) = 80 samples were predicted wrong.
Constructing the data for two-stage training
First merge classes 0, 4 and 5 into a single class 0 to form the stage-one training set.
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import re
import random

src_path = "../data/train.txt"
dev_path = "../data/dev.txt"
df1 = pd.read_table(src_path, sep="\t", header=None)
df2 = pd.read_table(dev_path, sep="\t", header=None)
# reset the index after concat, otherwise the positional loop below hits duplicate indices
df3 = pd.concat([df1, df2], axis=0).reset_index(drop=True)
df4 = pd.DataFrame()
for i in range(len(df3)):
    if df3[1][i] in [0, 4, 5]:
        # keep the original 0/4/5 rows (with their original labels) for the stage-two set
        df4 = pd.concat([df4, df3.iloc[i]], axis=1)
        df3.loc[i, 1] = 0  # merge classes 4 and 5 into class 0 for stage one
pd.DataFrame(df3[1]).value_counts()
# save the merged-label data as the stage-one training set (read back as train1.txt below)
df3.to_csv("../data/train1.txt", sep="\t", index=False, header=None)
Take the 0/4/5 rows separately to form the stage-two training set.
df4 = df4.T
df4 = df4.reset_index(drop=True)
data = []
for i in range(len(df4)):
    if df4[1][i] in [0, 4, 5]:   # df4 already contains only 0/4/5, this filter is just a safeguard
        data.append(df4.iloc[i])
pd.DataFrame(data)[1].value_counts()
pd.DataFrame(data).to_csv("../data/train2.txt", sep="\t", index=False, header=None)
Data augmentation
This approach was not adopted in the end. The idea is to tokenize the samples of each class and then randomly splice together words taken from different samples of the same class to form new samples; I generated 300 samples per class.
# Split the samples by class. Note: use the original labels here, not df3 after
# classes 4/5 were merged into 0, otherwise data4 and data5 would stay empty.
df_aug = pd.concat([df1, df2], axis=0).reset_index(drop=True)
data0, data1, data2, data3, data4, data5 = [], [], [], [], [], []
data_by_class = [data0, data1, data2, data3, data4, data5]
for i in range(len(df_aug)):
    data_by_class[df_aug[1][i]].append(df_aug[0][i])
import jieba
all_text = []
for i in [data0, data1, data2, data3, data4, data5]:
    text = []
    for j in range(len(i)):
        seg_list = jieba.lcut(i[j], cut_all=False)  # cut_all=False: precise mode
        text.append(seg_list)
    all_text.append(text)
all_text[2]  # inspect the tokenized samples of class 2
all_cat_text = []
for i in range(6):
    df_yhl = all_text[i]                  # tokenized samples of class i
    all_data = []
    for j in range(300):                  # generate 300 synthetic samples per class
        text = []
        # pick 5 random samples of this class
        random_integers = [random.randrange(len(df_yhl)) for _ in range(5)]
        for idx, n in enumerate(random_integers):
            df_text = df_yhl[n]
            if idx == 0:
                text.append(df_text[0])   # first word of the first picked sample
            else:
                # a random word from each of the other picked samples
                text.append(df_text[random.randrange(len(df_text))])
        text = "".join(text)
        all_data.append(text)
    all_cat_text.append(all_data)
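For completeness, if this augmentation had been adopted, the generated samples would still need to be written back into a training file. A minimal sketch (the path train_aug.txt is illustrative, not a file from the original project):
# Turn the generated samples into (text, label) rows and save them for later mixing with the training data.
aug_rows = []
for label, samples in enumerate(all_cat_text):
    for s in samples:
        aug_rows.append((s, label))
aug_df = pd.DataFrame(aug_rows)
aug_df.to_csv("../data/train_aug.txt", sep="\t", index=False, header=None)  # illustrative path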
Model building
Loading the data
# This part is built on bert4torch; depending on your bert4torch version,
# some of these utilities (e.g. Callback) may live in slightly different modules.
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from bert4torch.tokenizers import Tokenizer
from bert4torch.models import build_transformer_model, BaseModel
from bert4torch.snippets import sequence_padding, Callback, text_segmentate, ListDataset, seed_everything

# fix the random seed
seed_everything(42)
maxlen = 30
batch_size = 16
pretrained_dir = './'
config_path = pretrained_dir + 'bert_config.json'
checkpoint_path = pretrained_dir + 'pytorch_model.bin'
dict_path = pretrained_dir + 'vocab.txt'
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# build the tokenizer
tokenizer = Tokenizer(dict_path, do_lower_case=True)
# load the dataset
class MyDataset(ListDataset):
    @staticmethod
    def load_data(filenames):
        """Load the data, splitting each text into pieces of at most maxlen characters."""
        D = []
        seps, strips = u'\n。!?!?;;,, ', u';;,, '
        for filename in filenames:
            with open(filename, encoding='utf-8') as f:
                for l in f:
                    text, label = l.strip().split('\t')
                    for t in text_segmentate(text, maxlen - 2, seps, strips):
                        D.append((t, int(label)))
        return D
def collate_fn(batch):
    batch_token_ids, batch_segment_ids, batch_labels = [], [], []
    for text, label in batch:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        batch_token_ids.append(token_ids)
        batch_segment_ids.append(segment_ids)
        batch_labels.append([label])
    batch_token_ids = torch.tensor(sequence_padding(batch_token_ids), dtype=torch.long, device=device)
    batch_segment_ids = torch.tensor(sequence_padding(batch_segment_ids), dtype=torch.long, device=device)
    batch_labels = torch.tensor(batch_labels, dtype=torch.long, device=device)
    return [batch_token_ids, batch_segment_ids], batch_labels.flatten()
Stage one of the two-stage training (7-fold cross-validation)
One caveat: the model should be re-instantiated inside the loop for each fold, but in the code below it is created once outside the loop, so later folds keep training the same weights; this needs fixing (a corrected sketch is given right after the training loop).
# define the model structure on top of BERT
class Model(BaseModel):
    def __init__(self) -> None:
        super().__init__()
        self.bert = build_transformer_model(config_path=config_path,
                                            checkpoint_path=checkpoint_path,
                                            with_pool=True)
        self.dropout = nn.Dropout(0.1)
        self.dense = nn.Linear(self.bert.configs['hidden_size'], 6)

    def forward(self, token_ids, segment_ids):
        _, pooled_output = self.bert([token_ids, segment_ids])
        output = self.dropout(pooled_output)
        output = self.dense(output)
        return output

model = Model().to(device)

# define the loss and optimizer (both can be customized)
model.compile(
    loss=nn.CrossEntropyLoss(),
    optimizer=optim.Adam(model.parameters(), lr=2e-5),
)
class Evaluator(Callback):
    """Evaluate on the dev set and save the best checkpoint."""
    def __init__(self):
        super().__init__()
        self.best_val_acc = 0.

    def on_epoch_end(self, global_step, epoch, logs=None):
        val_acc = self.evaluate(valid_dataloader)
        if val_acc >= self.best_val_acc:
            self.best_val_acc = val_acc
            model.save_weights('./best_model_1.pt')
        print(f'val_acc: {val_acc:.5f}, best_val_acc: {self.best_val_acc:.5f}\n')

    # evaluation function
    def evaluate(self, data):
        total, right = 0., 0.
        for x_true, y_true in data:
            y_pred = model.predict(x_true).argmax(axis=1)
            total += len(y_true)
            right += (y_true == y_pred).sum().item()
        acc = right / total
        return acc
def inference(texts):
    '''Single-sample inference.'''
    ans = []
    for text in texts:
        token_ids, segment_ids = tokenizer.encode(text, maxlen=maxlen)
        token_ids = torch.tensor(token_ids, dtype=torch.long, device=device)[None, :]
        segment_ids = torch.tensor(segment_ids, dtype=torch.long, device=device)[None, :]
        logit = model.predict([token_ids, segment_ids])
        y_pred = torch.argmax(torch.softmax(logit, dim=-1)).cpu().numpy()
        ans.append(y_pred)
    return ans
def predict(file_path, output_path):
    f = open(file_path, 'r', encoding='utf-8')
    test_datas = f.readlines()
    # strip the trailing newline, otherwise it ends up inside the encoded text and the output lines
    test_datas = [data.strip().split('\t')[0] for data in test_datas]
    results = inference(test_datas)
    f.close()
    fw = open(output_path, 'w', encoding='utf-8')
    for i in range(len(test_datas)):
        fw.write(f"{test_datas[i]}\t{results[i]}\n")
    fw.close()
state = [1, 42, 100, 142, 500, 1200, 2023]
for i in range(7):
    src_path = "../data/train1.txt"
    df1 = pd.read_table(src_path, sep="\t", header=None)
    shuffled_df1 = df1.sample(frac=1, random_state=state[i])
    shuffled_df1.reset_index(drop=True, inplace=True)
    train_df = shuffled_df1[:6500]
    dev_df = shuffled_df1[6500:]
    dev_df.reset_index(drop=True, inplace=True)
    train_df.to_csv("../data/one_times_data/train/" + str(i) + ".txt", sep="\t",
                    index=False, header=None)
    dev_df.to_csv("../data/one_times_data/dev/" + str(i) + ".txt", sep="\t",
                  index=False, header=None)
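Strictly speaking, the splits above are 7 repeated random 6500/500 splits rather than disjoint folds. If you want true 7-fold splits, a minimal sketch using scikit-learn's KFold (an alternative, not what the original code does):
from sklearn.model_selection import KFold

df_full = pd.read_table("../data/train1.txt", sep="\t", header=None)
kf = KFold(n_splits=7, shuffle=True, random_state=42)
for fold, (train_idx, dev_idx) in enumerate(kf.split(df_full)):
    # each sample appears in exactly one dev split
    df_full.iloc[train_idx].to_csv(f"../data/one_times_data/train/{fold}.txt",
                                   sep="\t", index=False, header=None)
    df_full.iloc[dev_idx].to_csv(f"../data/one_times_data/dev/{fold}.txt",
                                 sep="\t", index=False, header=None)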
for i in range(7):
    train_dataloader = DataLoader(MyDataset(['../data/one_times_data/train/' + str(i) + ".txt"]),
                                  batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(MyDataset(['../data/one_times_data/dev/' + str(i) + ".txt"]),
                                  batch_size=batch_size, collate_fn=collate_fn)
    evaluator = Evaluator()
    # note: model is the single instance created above, so fold i+1 continues from fold i
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    predict('../data/pred.txt', '../output/one_times/' + str(i) + ".txt")
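As flagged above, each fold should start from a fresh model. A minimal sketch of the corrected loop, re-instantiating and re-compiling the model per fold while leaving everything else unchanged:
for i in range(7):
    # fresh weights and optimizer for every fold
    model = Model().to(device)
    model.compile(loss=nn.CrossEntropyLoss(),
                  optimizer=optim.Adam(model.parameters(), lr=2e-5))
    train_dataloader = DataLoader(MyDataset(['../data/one_times_data/train/' + str(i) + ".txt"]),
                                  batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
    valid_dataloader = DataLoader(MyDataset(['../data/one_times_data/dev/' + str(i) + ".txt"]),
                                  batch_size=batch_size, collate_fn=collate_fn)
    evaluator = Evaluator()
    model.fit(train_dataloader, epochs=10, steps_per_epoch=None, callbacks=[evaluator])
    predict('../data/pred.txt', '../output/one_times/' + str(i) + ".txt")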
Based on the stage-one predictions, extract the samples predicted as class 0 to form a new test set.
from collections import Counter

sub_path = "../output/one_times/"   # directory the stage-one predictions were written to above
net_data_index = []
for i in range(7):
    df = pd.read_csv(sub_path + str(i) + ".txt", sep="\t", header=None)
    data_index = []
    net_data = []
    for j in range(len(df)):
        net_data.append(df[0][j])
        data_index.append(df[1][j])
    net_data_index.append(data_index)

# vote over the 7 fold predictions at each position
votes = []
for i in range(len(data_index)):
    current_votes = [lst[i] for lst in net_data_index]
    counter = Counter(current_votes)
    winning_number = counter.most_common(1)[0][0]  # if all values differ, the first one seen wins
    votes.append(winning_number)
# print the final voting result
print("final voting result:", votes)
one_time = [net_data, votes]
pd.DataFrame(one_time).T.to_csv("submit1.txt", sep="\t", index=False, header=None)
df = pd.read_table("submit1.txt", sep="\t", header=None)
print(pd.DataFrame(df[1]).value_counts())
data = []
for i in range(len(df)):
    if df[1][i] == 0:
        data.append(df.iloc[i])
print(pd.DataFrame(data)[1].value_counts())
pd.DataFrame(data).to_csv("../data/pred2.txt", sep="\t", index=False, header=None)
Stage two trains on the 0/4/5 training set constructed earlier and then predicts the pred2.txt file produced at the end of stage one; the code is the same as above.
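The write-up does not show how the stage-two predictions are merged back into the stage-one submission. Here is a minimal sketch, assuming the stage-two model's predictions for pred2.txt were written in the same text-tab-label format to a file such as submit2.txt; the file names submit2.txt and submit.txt are illustrative:
# Replace the stage-one "class 0" predictions with the stage-two predictions (0/4/5).
stage1 = pd.read_table("submit1.txt", sep="\t", header=None)   # text, coarse label
stage2 = pd.read_table("submit2.txt", sep="\t", header=None)   # text, refined 0/4/5 label (illustrative)
refined = dict(zip(stage2[0], stage2[1]))
final_labels = [refined.get(stage1[0][i], stage1[1][i]) if stage1[1][i] == 0 else stage1[1][i]
                for i in range(len(stage1))]
pd.DataFrame({0: stage1[0], 1: final_labels}).to_csv("submit.txt", sep="\t", index=False, header=None)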
Running this BERT pipeline I got about 89, well below the large-model result. It feels like for NLP competitions these days you might as well just go straight to a large model.