本次跟着Coggle 30 Days of ML学习算法竞赛,而不是基于现成baseline来操作,预计重新熟悉并学习文本分类和关键词提取。
数据挖掘比赛为“汽车领域多语种迁移学习挑战赛”,比赛报名地址:2022 iFLYTEK A.I.开发者大赛-讯飞开放平台
以下为打卡内容:
- 任务1:报名比赛
- 步骤1:报名比赛2022 iFLYTEK A.I.开发者大赛-讯飞开放平台
- 步骤2:下载比赛数据(点击比赛页面的赛题数据)
- 步骤3:解压比赛数据,并使用pandas进行读取;
- 步骤4:查看训练集和测试集字段类型,并将数据读取代码写到博客;
import pandas as pd
# Load the three monolingual training CSVs.
# NOTE(review): '中文_trian.csv' spells "train" as "trian" — this appears to match
# the file name as distributed by the competition; confirm before renaming.
cn_train = pd.read_csv('中文_trian.csv')
jp_train = pd.read_csv('日语_train.csv')
en_train = pd.read_csv('英文_train.csv')
# The test sets live in a single workbook, one sheet per language.
jp_test = pd.read_excel('testA.xlsx',sheet_name = '日语_testA')
en_test = pd.read_excel('testA.xlsx',sheet_name = '英文_testA')
# Keep only the first five columns of the Japanese training data
# (presumably trims trailing unused columns — verify against the raw file).
jp_train = jp_train.iloc[: ,0:5]
# 分词
import jieba
import nagisa # 日文分词
def cutwords(sentence):
    """Segment a Chinese sentence into a list of tokens with jieba."""
    return jieba.lcut(sentence)
# Build a space-joined 'words' column for each corpus:
# Chinese via jieba, Japanese via nagisa, English by lowercasing only.
cn_train['words'] = cn_train['原始文本'].apply(lambda x:' '.join(cutwords(x)))
jp_train['words'] = jp_train['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
en_train['words'] = en_train['原始文本'].apply(lambda x:x.lower())
jp_test['words'] = jp_test['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
en_test['words'] = en_test['原始文本'].apply(lambda x:x.lower())
-
任务3:TFIDF与文本分类
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
# Combine the Japanese and English training data, then split train/validation.
# DataFrame.append is deprecated (removed in pandas 2.0) — use pd.concat.
data = pd.concat([jp_train, en_train], ignore_index=True)
y = data['意图'].values
xtrain, xvalid, ytrain, yvalid = train_test_split(
    data.words.values, y,
    stratify=y,           # keep the label distribution in both splits
    random_state=42,
    test_size=0.2, shuffle=True)
print(xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape)
# Fit the TF-IDF vocabulary on all training text, then transform each split.
tfv = TfidfVectorizer(use_idf=True)
tfv.fit(jp_train['words'].tolist() + en_train['words'].tolist())
# BUG FIX: the original transformed undefined names (`jp + en`,
# `jp_valid + en_valid`), which raises NameError; the intended inputs are
# the arrays produced by train_test_split above.
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)
# Logistic-regression intent classifier on the TF-IDF features.
from sklearn.linear_model import LogisticRegression
# BUG FIX: multi_class='acc' is not a valid option (valid: 'auto'/'ovr'/
# 'multinomial') and would raise ValueError; use 'auto'.
clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='auto')
# BUG FIX: fit() was called without the target array.
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict(xvalid_tfv)
from sklearn.metrics import accuracy_score
accuracy_score(yvalid, predictions)  # validation accuracy ~90%
# Predict intents for both test sets and assemble the submission workbook.
jptest_tfv = tfv.transform(jp_test.words.values)
entest_tfv = tfv.transform(en_test.words.values)
jp_predictions = clf.predict(jptest_tfv)
en_predictions = clf.predict(entest_tfv)
import numpy as np
# Work on copies so the original test DataFrames are not mutated
# (the original aliased jp_test/en_test and modified them in place).
jp_submit = jp_test.copy()
jp_submit['意图'] = jp_predictions
jp_submit['槽值1'] = np.nan  # slot values are not predicted by this model
jp_submit['槽值2'] = np.nan
jp_submit.drop(['words'], axis=1, inplace=True)
en_submit = en_test.copy()
en_submit['意图'] = en_predictions
en_submit['槽值1'] = np.nan
en_submit['槽值2'] = np.nan
en_submit.drop(['words'], axis=1, inplace=True)
# Write both sheets; the context manager saves and closes the workbook
# (ExcelWriter.save() was deprecated and removed in pandas 2.0).
with pd.ExcelWriter('yuliao_submit.xlsx') as writer:
    en_submit.to_excel(writer, sheet_name='英文_testA', index=None)
    jp_submit.to_excel(writer, sheet_name='日语_testA', index=None)
分数只有0.59,一方面TF-IDF在这么小的数据集上效果还是没那么好,另一方面结果的槽值1、槽值2都没有填写。
-
任务4:正则表达式
- 步骤1:学习使用正则表达式,并提取文本中的连续数值;
- 步骤2:使用正则表达式进行槽值匹配(基于历史的槽值字符串)
import re
#匹配连续出现的任意字符
def get_lianxu(word):
    """Return the characters that appear doubled (two in a row) in *word*.

    findall with a capture group returns only the captured character,
    one entry per non-overlapping doubled pair.
    """
    # Use a raw string for the regex; the original "(.)\\1{1}" works but the
    # {1} quantifier is redundant (a backreference already matches once).
    return re.findall(r"(.)\1", word)
# Extract doubled characters from each Japanese sentence and inspect how often
# each result occurs, as a rough probe for slot-value patterns.
jp_train['re'] = jp_train['原始文本'].apply(get_lianxu)
jp_train['re'].value_counts()
考虑到槽值计算仅靠正则表达式效果不会特别好,因此该任务点到为止。使用BERT来提高比赛分数更稳妥。
-
任务5:BERT模型入门
- 学习transformers库中pipline和加载模型的过程
- 学习transformers库的使用:包括定义数据集,定义模型和训练模型
- 学习资料:
# Load a BERT checkpoint through the HuggingFace transformers Auto classes.
# NOTE(review): AutoModelForMaskedLM carries the masked-LM head; the later
# classification model (my_bert_model) is what the competition actually uses.
from transformers import AutoTokenizer, AutoModelForMaskedLM
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
任务5,6,7都是使用BERT模型,因此将其综合。
-
任务6:BERT文本分类
- 步骤1使用BERT完成意图识别(文本分类)
- 步骤2:将步骤1预测的结果文件提交到比赛,截图分数;
- 学习资料:
-
任务7:BERT实体抽取
- 步骤1:使用BERT完成实体抽取(槽位识别)
- 步骤2:将步骤1预测的结果文件提交到比赛,截图分数;
- 学习资料:
from transformers import BertTokenizer,BertModel,BertConfig,AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import logging
logging.set_verbosity_error()
import pandas as pd
import numpy as np
# Intent-name -> integer id mapping (17 classes) for the classifier head.
_intent_names = [
    'open_ac',
    'close_ac',
    'open_ac_mode',
    'adjust_ac_temperature_to_number',
    'raise_ac_temperature_little',
    'lower_ac_temperature_little',
    'adjust_ac_windspeed_to_number',
    'open_car_device',
    'close_car_device',
    'map_control_confirm',
    'navigate_poi',
    'navigate_landmark_poi',
    'view_trans',
    'music_search_artist_song',
    'collect_music',
    'open_collect_music',
    'play_collect_music',
]
labels = {name: idx for idx, name in enumerate(_intent_names)}
# Map intent strings to integer ids, split, and tokenize with BERT.
# NOTE(review): `data` is the combined jp+en training frame built earlier;
# bert-base-uncased is an English-only tokenizer, so Japanese text will be
# tokenized poorly — a multilingual checkpoint would likely score better.
label = [labels[label] for label in data['意图']]
x = list(data['原始文本'])
y = label
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=64)
test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=64)
# 数据集读取, 继承torch的Dataset类,方便后面用DataLoader封装数据集
class NewsDataset(Dataset):
    """Wrap tokenizer encodings and labels as a torch Dataset.

    DataLoader uses __getitem__/__len__ to batch samples by index.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # One tensor per encoding field (input_ids, attention_mask, ...),
        # plus the integer label under the key the training loop expects.
        sample = {name: torch.tensor(vals[idx]) for name, vals in self.encodings.items()}
        sample['labels'] = torch.tensor(int(self.labels[idx]))
        return sample

    def __len__(self):
        # Number of samples in the dataset.
        return len(self.labels)
# Wrap the tokenized splits as torch Datasets.
train_dataset = NewsDataset(train_encoding, y_train)
test_dataset = NewsDataset(test_encoding, y_test)
# Batch with DataLoader (batch size 16).
# NOTE(review): shuffling the test loader is harmless here because validation
# averages per-batch accuracy, but it is unnecessary.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)
定义模型:
class my_bert_model(nn.Module):
    """BERT encoder with a linear head over the concatenated last-4-layer
    [CLS] hidden states (17 intent classes)."""

    def __init__(self, freeze_bert=False, hidden_size=768):
        super().__init__()
        cfg = BertConfig.from_pretrained('bert-base-uncased')
        # Expose every layer's hidden states so forward() can combine them.
        cfg.update({'output_hidden_states': True})
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=cfg)
        self.fc = nn.Linear(hidden_size * 4, 17)
        if freeze_bert:
            # Freeze the encoder so only the classification head trains.
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        # outputs[2] is a tuple of per-layer hidden states; stack them into
        # a single tensor of shape (layers, batch, seq, hidden).
        hidden = torch.stack(outputs[2])
        # Concatenate the last four layers (-1, -2, -3, -4) along the
        # hidden dimension, then keep the [CLS] position (token 0).
        last_four = torch.cat(tuple(hidden[-1 - k] for k in range(4)), dim=-1)
        cls_repr = last_four[:, 0, :]
        return self.fc(cls_repr)
# Training function: one pass over train_loader.
def train():
    """Run one training epoch and log progress every 100 iterations.

    Relies on module-level globals: model, optim, scheduler, criterion,
    device, train_loader and the current `epoch` counter.
    """
    model.train()
    total_train_loss = 0
    total_iter = len(train_loader)
    # iter_num counts completed batches (1-based), matching the original
    # post-increment counter.
    for iter_num, batch in enumerate(train_loader, start=1):
        # Forward pass.
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()
        loss.backward()
        # Gradient clipping to avoid exploding gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Parameter and learning-rate updates.
        optim.step()
        scheduler.step()
        if iter_num % 100 == 0:
            # BUG FIX: the original message misspelled "epoch" as "epoth".
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num / total_iter * 100))
    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))
# Accuracy helper for raw logits and integer labels.
def flat_accuracy(preds, labels):
    """Fraction of rows where the argmax of *preds* matches *labels*."""
    predicted = np.argmax(preds, axis=1).flatten()
    expected = labels.flatten()
    return np.sum(predicted == expected) / len(expected)
def validation():
    """Evaluate on test_dataloader; print mean accuracy and mean loss.

    Relies on module-level globals: model, criterion, device,
    test_dataloader and flat_accuracy.
    """
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    # No gradients needed during evaluation.
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)
            total_eval_loss += criterion(outputs, labels).item()
            # Accumulate per-batch accuracy on CPU numpy arrays.
            logits = outputs.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)
    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % (avg_val_accuracy))
    print("Average testing loss: %.4f" % (total_eval_loss / len(test_dataloader)))
    print("-------------------------------")
# Train for a single epoch, then evaluate on the held-out split.
for epoch in range(1):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()
最后结果准确率还是很低,后续有时间再研究如何调整吧。