Automotive-Domain Multilingual Transfer Learning Challenge (Coggle 30 Days of ML)

This time I'm following Coggle 30 Days of ML to learn how to approach an algorithm competition, rather than working from a ready-made baseline; the goal is to get reacquainted with text classification and keyword extraction.

The data mining competition is the Automotive-Domain Multilingual Transfer Learning Challenge (汽车领域多语种迁移学习挑战赛); registration page: 2022 iFLYTEK A.I. Developer Contest - iFLYTEK Open Platform.

Below is the check-in log:

  • Task 1: Register for the competition
    • Step 1: register for the 2022 iFLYTEK A.I. Developer Contest on the iFLYTEK Open Platform
    • Step 2: download the competition data (click "赛题数据" on the competition page)
    • Step 3: unzip the data and read it with pandas;
    • Step 4: check the field types of the training and test sets, and write the data-loading code up on the blog (a quick dtype check follows the loading code below);
import pandas as pd

# Training sets are per-language CSV files; test sets share one Excel workbook with one sheet per language
cn_train = pd.read_csv('中文_train.csv')
jp_train = pd.read_csv('日语_train.csv')
en_train = pd.read_csv('英文_train.csv')
jp_test = pd.read_excel('testA.xlsx', sheet_name='日语_testA')
en_test = pd.read_excel('testA.xlsx', sheet_name='英文_testA')

# Keep only the first five columns of the Japanese training set (drop extra trailing columns)
jp_train = jp_train.iloc[:, 0:5]
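Step 4 asks for the field types; a quick check over the frames just loaded (this assumes nothing beyond the DataFrames above, it only prints whatever pandas inferred):

# Print shape, inferred dtypes and a couple of rows for each set
for name, df in [('cn_train', cn_train), ('jp_train', jp_train), ('en_train', en_train),
                 ('jp_test', jp_test), ('en_test', en_test)]:
    print(name, df.shape)
    print(df.dtypes)
    print(df.head(2), '\n')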
  • Task 2: Text analysis and tokenization
    • Step 1: tokenize the Chinese text with jieba;
    • Step 2: tokenize the Japanese text with nagisa;
# Tokenization
import jieba    # Chinese word segmentation
import nagisa   # Japanese word segmentation

def cutwords(sentence):
    result = jieba.lcut(sentence)
    return result

# Chinese and Japanese: join the tokens with spaces; English: just lowercase the raw text
cn_train['words'] = cn_train['原始文本'].apply(lambda x: ' '.join(cutwords(x)))
jp_train['words'] = jp_train['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
en_train['words'] = en_train['原始文本'].apply(lambda x: x.lower())

jp_test['words'] = jp_test['原始文本'].apply(lambda x: ' '.join(nagisa.tagging(x).words))
en_test['words'] = en_test['原始文本'].apply(lambda x: x.lower())
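The task title also covers text analysis; as a minimal sketch, a word-frequency count over the tokenized Chinese training text (the words column built above) gives a first look at the vocabulary:

from collections import Counter

# Token frequencies over the tokenized Chinese training sentences
cn_counter = Counter()
for sent in cn_train['words']:
    cn_counter.update(sent.split())
print(cn_counter.most_common(20))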
  • Task 3: TF-IDF and text classification

    • Step 1: learn how TF-IDF is used and extract TF-IDF features from the corpus (a short note on the weighting follows this list);
    • Step 2: train logistic regression on the TF-IDF features (over all language corpora) and classify the intents of the test set;
    • Step 3: submit the prediction file from Step 2 to the competition and screenshot the score;
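In brief, TF-IDF weights a term by how often it occurs in a document, scaled down by how many documents contain it. With scikit-learn's TfidfVectorizer defaults (smooth_idf=True, norm='l2'), the weight is roughly tf(t, d) × (ln((1 + n) / (1 + df(t))) + 1), followed by L2 normalization of each document vector, where n is the number of documents and df(t) is the document frequency of term t.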
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Split into training and validation sets (Japanese + English only, since testA has only those two sheets)
data = pd.concat([jp_train, en_train], ignore_index=True)
y = data['意图'].values
xtrain, xvalid, ytrain, yvalid = train_test_split(data.words.values, y,
                                                  stratify=y,
                                                  random_state=42,
                                                  test_size=0.2, shuffle=True)
print(xtrain.shape, xvalid.shape, ytrain.shape, yvalid.shape)

# Extract TF-IDF features
tfv = TfidfVectorizer(use_idf=True)
tfv.fit(jp_train['words'].tolist() + en_train['words'].tolist())
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

# Logistic regression model
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(C=1.0, solver='lbfgs', multi_class='multinomial')
clf.fit(xtrain_tfv, ytrain)

predictions = clf.predict(xvalid_tfv)
from sklearn.metrics import accuracy_score
accuracy_score(yvalid, predictions)  # validation accuracy was about 90%

# Predict on the test sets
jptest_tfv = tfv.transform(jp_test.words.values)
entest_tfv = tfv.transform(en_test.words.values)

jp_predictions = clf.predict(jptest_tfv)
en_predictions = clf.predict(entest_tfv)

import numpy as np
jp_submit = jp_test.copy()
jp_submit['意图'] = jp_predictions
jp_submit['槽值1'] = np.nan
jp_submit['槽值2'] = np.nan
jp_submit.drop(['words'], axis=1, inplace=True)

en_submit = en_test.copy()
en_submit['意图'] = en_predictions
en_submit['槽值1'] = np.nan
en_submit['槽值2'] = np.nan
en_submit.drop(['words'], axis=1, inplace=True)

# Write the submission file, one sheet per language
with pd.ExcelWriter('yuliao_submit.xlsx') as writer:
    en_submit.to_excel(writer, sheet_name='英文_testA', index=None)
    jp_submit.to_excel(writer, sheet_name='日语_testA', index=None)

The score was only 0.59. On one hand, TF-IDF simply doesn't perform that well on such a small dataset; on the other hand, the slot-value columns 槽值1 and 槽值2 were left empty in the submission.

  • Task 4: Regular expressions

    • Step 1: learn to use regular expressions and extract consecutive digits from the text (a digit-extraction sketch follows the code below);
    • Step 2: use regular expressions for slot-value matching (based on historical slot-value strings)
import re

# Match any character that occurs twice in a row (consecutive repeated characters)
def get_lianxu(word):
    return re.findall(r"(.)\1", word)

jp_train['re'] = jp_train['原始文本'].apply(get_lianxu)
# The matches are lists, which are unhashable; cast to str before counting
jp_train['re'].astype(str).value_counts()
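For Step 1 (consecutive digits), a minimal sketch with \d+ on the same 原始文本 column; note that full-width digits (０-９), if present, would need normalization or a wider character class:

# Extract runs of consecutive digits, e.g. target temperatures or fan-speed numbers
def get_numbers(text):
    return re.findall(r"\d+", text)

jp_train['numbers'] = jp_train['原始文本'].apply(get_numbers)
print(jp_train['numbers'].head())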

Since slot-value extraction based on regular expressions alone is unlikely to work particularly well, this task stops here; using BERT to raise the competition score is the safer bet.

  • Task 5: Getting started with BERT

    • Learn the pipeline and model-loading workflow of the transformers library (a small pipeline sketch follows the loading code below)
    • Learn how to use the transformers library: defining datasets, defining models, and training models
    • Study materials:
# Load a BERT model; see the Hugging Face model hub for reference
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
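The task also mentions pipeline; a minimal sketch of the fill-mask pipeline built from the same checkpoint, just to illustrate the loading workflow (the example sentence is my own):

from transformers import pipeline

# Masked-language-model pipeline on bert-base-uncased
fill_mask = pipeline("fill-mask", model="bert-base-uncased")
print(fill_mask("Please turn on the air [MASK]."))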

Tasks 5, 6, and 7 all revolve around the BERT model, so they are combined below.

from transformers import BertTokenizer,BertModel,BertConfig,AdamW, get_linear_schedule_with_warmup
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, TensorDataset
from transformers import logging
logging.set_verbosity_error()
import pandas as pd
import numpy as np

# Map each intent string to an integer class id (17 classes)
labels = {'open_ac':0,
          'close_ac':1,
          'open_ac_mode':2,
          'adjust_ac_temperature_to_number':3,
          'raise_ac_temperature_little':4,
          'lower_ac_temperature_little':5,
          'adjust_ac_windspeed_to_number':6,
          'open_car_device':7,
          'close_car_device':8,
          'map_control_confirm':9,
          'navigate_poi':10,
          'navigate_landmark_poi':11,
          'view_trans':12,
          'music_search_artist_song':13,
          'collect_music':14,
          'open_collect_music':15,
          'play_collect_music':16,
          }
# data is the combined Japanese + English training frame built in Task 3; reuse its raw text and intents
label = [labels[intent] for intent in data['意图']]
x = list(data['原始文本'])
y = label
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test =  train_test_split(x, y, test_size=0.2)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
train_encoding = tokenizer(x_train, truncation=True, padding=True, max_length=64)
test_encoding = tokenizer(x_test, truncation=True, padding=True, max_length=64)

# Dataset wrapper: subclass torch's Dataset so the data can later be batched with DataLoader
class NewsDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    # idx lets DataLoader turn the dataset into batches by mapping indices to samples
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(int(self.labels[idx]))
        return item
    
    # Dataset length, so len(instance) works
    def __len__(self):
        return len(self.labels)

# Wrap the encodings as torch Datasets
train_dataset = NewsDataset(train_encoding, y_train)
test_dataset = NewsDataset(test_encoding, y_test)

# From single samples to batched loading
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=True)

Define the model:

class my_bert_model(nn.Module):
    def __init__(self, freeze_bert=False, hidden_size=768):
        super().__init__()
        config = BertConfig.from_pretrained('bert-base-uncased')
        config.update({'output_hidden_states': True})
        self.bert = BertModel.from_pretrained("bert-base-uncased", config=config)
        self.fc = nn.Linear(hidden_size * 4, 17)
        
        # Optionally freeze BERT so its parameters are not updated
        if freeze_bert:
            for p in self.bert.parameters():
                p.requires_grad = False
    
    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids, attention_mask)
        # The hidden states of every layer come back as a tuple; stack them into one tensor
        all_hidden_states = torch.stack(outputs[2])
        # Concatenate the outputs of the last 4 layers
        concat_last_4layers = torch.cat((all_hidden_states[-1],
                                         all_hidden_states[-2],
                                         all_hidden_states[-3],
                                         all_hidden_states[-4]), dim=-1)
        
        # Take the [CLS] token's concatenated representation and feed it through the classifier
        cls_concat = concat_last_4layers[:, 0, :]
        result = self.fc(cls_concat)
        
        return result
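The training function below uses model, device, criterion, optim and scheduler, which are not defined in the post; a minimal setup sketch (the learning rate, warmup steps and epoch count are my own assumptions):

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = my_bert_model().to(device)

criterion = nn.CrossEntropyLoss()
optim = AdamW(model.parameters(), lr=2e-5)      # AdamW imported from transformers above

epochs = 1
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optim,
                                            num_warmup_steps=0,
                                            num_training_steps=total_steps)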
     
# Training function
def train():
    model.train()
    total_train_loss = 0
    iter_num = 0
    total_iter = len(train_loader)
    for batch in train_loader:
        # Forward pass
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask)
        loss = criterion(outputs, labels)
        total_train_loss += loss.item()

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)   # Gradient clipping to avoid exploding gradients

        # Parameter update
        optim.step()
        scheduler.step()

        iter_num += 1
        if iter_num % 100 == 0:
            print("epoch: %d, iter_num: %d, loss: %.4f, %.2f%%" % (epoch, iter_num, loss.item(), iter_num / total_iter * 100))

    print("Epoch: %d, Average training loss: %.4f" % (epoch, total_train_loss / len(train_loader)))
# Accuracy calculation
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
    
def validation():
    model.eval()
    total_eval_accuracy = 0
    total_eval_loss = 0
    with torch.no_grad():
        for batch in test_dataloader:
            # Forward pass
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            outputs = model(input_ids, attention_mask=attention_mask)

            loss = criterion(outputs, labels)
            logits = outputs

            total_eval_loss += loss.item()
            logits = logits.detach().cpu().numpy()
            label_ids = labels.to('cpu').numpy()
            total_eval_accuracy += flat_accuracy(logits, label_ids)

    avg_val_accuracy = total_eval_accuracy / len(test_dataloader)
    print("Accuracy: %.4f" % avg_val_accuracy)
    print("Average testing loss: %.4f" % (total_eval_loss / len(test_dataloader)))
    print("-------------------------------")
for epoch in range(1):
    print("------------Epoch: %d ----------------" % epoch)
    train()
    validation()
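To turn this into a submission, the fine-tuned model still has to predict the test-set intents; a minimal inference sketch (the id2label mapping, the batching, and the predict_intents helper are my own additions, and the slot-value columns would still be left empty as in Task 3):

# Map class ids back to intent strings
id2label = {v: k for k, v in labels.items()}

def predict_intents(texts, batch_size=32):
    model.eval()
    preds = []
    with torch.no_grad():
        for i in range(0, len(texts), batch_size):
            enc = tokenizer(texts[i:i + batch_size], truncation=True, padding=True,
                            max_length=64, return_tensors='pt').to(device)
            logits = model(enc['input_ids'], attention_mask=enc['attention_mask'])
            preds.extend(logits.argmax(dim=-1).cpu().tolist())
    return [id2label[p] for p in preds]

jp_test['意图'] = predict_intents(list(jp_test['原始文本']))
en_test['意图'] = predict_intents(list(en_test['原始文本']))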

In the end the accuracy was still quite low; I'll look into how to tune it when I have more time.
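One hedged guess at a contributing factor: bert-base-uncased ships an essentially English vocabulary, so Japanese and Chinese text tokenizes poorly (many characters end up as [UNK]). Swapping in a multilingual checkpoint for both the tokenizer and the model is a common first adjustment, e.g.:

# Same loading pattern, but with a multilingual checkpoint (an assumption, not from the original post)
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')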
