小黑fastNLP实战:实体识别1

1.数据读取

import os
# Root of the ATIS dataset; each split lives in its own sub-directory.
data_dir = './data/atis/'
train_dir = os.path.join(data_dir, 'train')
test_dir = os.path.join(data_dir, 'test')
dev_dir = os.path.join(data_dir, 'dev')

# Every split stores the tokenised sentences in 'seq.in' and the
# aligned label sequences in 'seq.out'.
train_text_path, train_label_path = (
    os.path.join(train_dir, f) for f in ('seq.in', 'seq.out'))
dev_text_path, dev_label_path = (
    os.path.join(dev_dir, f) for f in ('seq.in', 'seq.out'))
test_text_path, test_label_path = (
    os.path.join(test_dir, f) for f in ('seq.in', 'seq.out'))
# 读取路径
def read_text(file_name):
    """Read a whitespace-tokenised file, one sequence per line.

    Parameters
    ----------
    file_name : str
        Path to a UTF-8 text file whose lines hold space-separated tokens.

    Returns
    -------
    list[list[str]]
        One token list per line; a blank line yields an empty list.
    """
    # Fix: use a context manager so the file handle is closed deterministically
    # (the original relied on the garbage collector to close it).
    with open(file_name, 'r', encoding='utf-8') as f:
        return [line.strip().split() for line in f]
# Materialise every split as parallel token / label sequence lists.
(train_text, train_label,
 dev_text, dev_label,
 test_text, test_label) = (
    read_text(path)
    for path in (train_text_path, train_label_path,
                 dev_text_path, dev_label_path,
                 test_text_path, test_label_path)
)

2.数据集构建

from fastNLP import DataSet,Instance
from fastNLP.io import DataBundle
def from_array_to_dataset(texts, labels):
    """Build a fastNLP DataSet from parallel token and label sequences.

    Parameters
    ----------
    texts : list[list[str]]
        Tokenised sentences.
    labels : list[list[str]]
        BIO label sequences aligned with *texts*.

    Returns
    -------
    DataSet
        With fields 'text', 'target' and 'seq_len' (added from 'text').

    Raises
    ------
    ValueError
        If a sentence and its label sequence differ in length.
    """
    ds = DataSet()
    # NOTE(review): zip() silently stops at the shorter of texts/labels;
    # a count mismatch between the two files is not detected here.
    for text, label in zip(texts, labels):
        # Fix: raise instead of assert — asserts are stripped under
        # `python -O`, which would silently admit misaligned data.
        if len(text) != len(label):
            raise ValueError(
                'token/label length mismatch: %d vs %d' % (len(text), len(label)))
        ds.append(Instance(text=text, target=label))
    ds.add_seq_len('text')
    return ds
# Wrap each split in a DataSet, then register all three in one DataBundle.
train_ds = from_array_to_dataset(train_text, train_label)
test_ds = from_array_to_dataset(test_text, test_label)
dev_ds = from_array_to_dataset(dev_text, dev_label)

data_bundle = DataBundle()
for split_name, split_ds in (('train', train_ds),
                             ('test', test_ds),
                             ('dev', dev_ds)):
    data_bundle.set_dataset(name=split_name, dataset=split_ds)
print(data_bundle.datasets)

{‘train’: ±-------------------------±-------------------------±--------+
| text | target | seq_len |
±-------------------------±-------------------------±--------+
| [‘i’, ‘want’, ‘to’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 10 |
| [‘round’, ‘trip’, 'fa… | [‘B-round_trip’, 'I-r… | 33 |
| [‘show’, ‘me’, ‘the’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 10 |
| [‘what’, ‘are’, ‘the’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 19 |
| [‘which’, ‘airlines’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 11 |
| [“i’m”, ‘looking’, 'f… | [‘O’, ‘O’, ‘O’, ‘O’, … | 25 |
| [‘okay’, ‘and’, 'then… | [‘O’, ‘O’, ‘O’, ‘O’, … | 14 |
| [‘show’, ‘me’, ‘all’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘okay’, “i’d”, 'like… | [‘O’, ‘O’, ‘O’, ‘O’, … | 18 |
| [‘on’, ‘tuesday’, 'wh… | [‘O’, 'B-depart_date… | 14 |
| [‘american’, 'flights… | [‘B-airline_name’, 'O… | 8 |
| [‘what’, ‘types’, 'of… | [‘O’, ‘O’, ‘O’, ‘O’, … | 11 |
| [‘in’, ‘the’, ‘next’,… | [‘O’, ‘O’, 'B-depart_… | 17 |
| [‘does’, 'continental… | [‘O’, 'B-airline_name… | 9 |
| [‘chicago’, ‘to’, 'mi… | ['B-fromloc.city_name… | 3 |
| [‘how’, ‘many’, 'flig… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘show’, ‘me’, ‘the’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 11 |
| [“i’d”, ‘like’, ‘to’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 14 |
| [‘how’, ‘many’, 'book… | [‘O’, ‘O’, ‘O’, ‘O’, … | 6 |
| [‘what’, ‘are’, ‘the’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 13 |
| [‘what’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-fr… | 14 |
| [‘please’, ‘list’, 'a… | [‘O’, ‘O’, ‘O’, ‘O’, … | 12 |
| [‘what’, ‘time’, 'zon… | [‘O’, ‘O’, ‘O’, ‘O’, … | 6 |
| [‘show’, ‘me’, 'groun… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘i’, ‘want’, ‘to’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 15 |
| [‘from’, ‘seattle’, '… | [‘O’, 'B-fromloc.city… | 6 |
| [‘can’, ‘you’, ‘show’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 13 |
| [‘what’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 5 |
| [‘what’, ‘are’, ‘the’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 8 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-fr… | 13 |
| [‘please’, ‘give’, 'm… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| … | … | … |
±-------------------------±-------------------------±--------+, ‘test’: ±-------------------------±-------------------------±--------+
| text | target | seq_len |
±-------------------------±-------------------------±--------+
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 19 |
| [‘on’, ‘april’, 'firs… | [‘O’, 'B-depart_date… | 16 |
| [‘on’, ‘april’, 'firs… | [‘O’, 'B-depart_date… | 13 |
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 17 |
| [‘i’, ‘need’, ‘a’, 'f… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| [‘monday’, ‘morning’,… | ['B-depart_date.day_n… | 11 |
| [‘on’, ‘wednesday’, '… | [‘O’, 'B-depart_date… | 17 |
| [‘after’, ‘12’, ‘pm’,… | ['B-depart_time.time_… | 17 |
| [‘are’, ‘there’, 'any… | [‘O’, ‘O’, ‘O’, ‘O’, … | 13 |
| [‘find’, ‘a’, 'flight… | [‘O’, ‘O’, ‘O’, ‘O’, … | 8 |
| [‘on’, ‘next’, 'wedne… | [‘O’, 'B-depart_date… | 20 |
| [‘show’, ‘flight’, 'a… | [‘O’, ‘O’, ‘O’, ‘O’, … | 17 |
| [‘flight’, ‘on’, 'ame… | [‘O’, ‘O’, 'B-airline… | 13 |
| [‘find’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-to… | 8 |
| [‘find’, ‘nonstop’, '… | [‘O’, ‘B-flight_stop’… | 14 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-fr… | 8 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, 'B-depart_… | 9 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, 'B-depart_… | 9 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-fr… | 9 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, 'B-fr… | 8 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, 'B-depart_… | 9 |
| [‘show’, ‘flights’, '… | [‘O’, ‘O’, 'B-depart_… | 9 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘what’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 12 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 20 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, 'B-ai… | 10 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 10 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 11 |
| [‘which’, ‘flights’, … | [‘O’, ‘O’, ‘O’, ‘O’, … | 10 |
| … | … | … |
±-------------------------±-------------------------±--------+, ‘dev’: ±-------------------------±-------------------------±--------+
| text | target | seq_len |
±-------------------------±-------------------------±--------+
| [‘i’, ‘want’, ‘to’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 18 |
| [‘show’, ‘me’, ‘all’,… | [‘O’, ‘O’, ‘O’, 'B-ro… | 11 |
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| [‘what’, ‘are’, ‘the’… | [‘O’, ‘O’, ‘O’, 'B-cl… | 16 |
| [“i’m”, ‘flying’, 'fr… | [‘O’, ‘O’, ‘O’, 'B-fr… | 8 |
| [‘okay’, ‘can’, ‘you’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 12 |
| [‘from’, ‘montreal’, … | [‘O’, 'B-fromloc.city… | 5 |
| [‘what’, ‘is’, ‘the’,… | [‘O’, ‘O’, ‘O’, 'B-fl… | 11 |
| [‘flights’, ‘from’, '… | [‘O’, ‘O’, 'B-fromloc… | 11 |
| [‘what’, ‘is’, ‘the’,… | [‘O’, ‘O’, ‘O’, 'B-fl… | 11 |
| [‘flights’, ‘from’, '… | [‘O’, ‘O’, 'B-fromloc… | 5 |
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| [‘okay’, ‘that’, 'sou… | [‘O’, ‘O’, ‘O’, ‘O’, … | 22 |
| [‘show’, ‘me’, ‘the’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 12 |
| [‘flights’, ‘from’, '… | [‘O’, ‘O’, 'B-fromloc… | 10 |
| [“i’m”, ‘interested’,… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘i’, ‘am’, 'interest… | [‘O’, 'B-depart_time… | 27 |
| [“i’m”, ‘looking’, 'f… | [‘O’, ‘O’, ‘O’, ‘O’, … | 16 |
| [“what’s”, 'restricti… | [‘O’, ‘O’, 'B-restric… | 3 |
| [‘what’, ‘types’, 'of… | [‘O’, ‘O’, ‘O’, ‘O’, … | 9 |
| [‘what’, ‘does’, 'the… | [‘O’, ‘O’, ‘O’, ‘O’, … | 6 |
| [‘a’, ‘first’, 'class… | [‘O’, ‘B-class_type’,… | 13 |
| [‘please’, ‘list’, 't… | [‘O’, ‘O’, ‘O’, ‘O’, … | 11 |
| [‘what’, ‘flights’, '… | [‘O’, ‘O’, ‘O’, ‘O’, … | 10 |
| [‘on’, ‘united’, 'air… | [‘O’, 'B-airline_name… | 18 |
| [‘i’, ‘need’, ‘a’, 'f… | [‘O’, ‘O’, ‘O’, ‘O’, … | 14 |
| [‘what’, ‘are’, ‘the’… | [‘O’, ‘O’, ‘O’, 'B-co… | 11 |
| [‘does’, 'continental… | [‘O’, 'B-airline_name… | 8 |
| [‘i’, ‘would’, ‘like’… | [‘O’, ‘O’, ‘O’, ‘O’, … | 15 |
| [‘on’, ‘continental’,… | [‘O’, 'B-airline_name… | 11 |
| [‘find’, ‘me’, ‘the’,… | [‘O’, ‘O’, ‘O’, 'B-co… | 9 |
| … | … | … |
±-------------------------±-------------------------±--------+}
3.建立词典

from fastNLP import Vocabulary
# Shorthands for the three splits; argument order is preserved from the
# original because Vocabulary builds indices in encounter order.
tr = data_bundle.get_dataset('train')
te = data_bundle.get_dataset('test')
de = data_bundle.get_dataset('dev')

# Word vocabulary: entries are created from train only; dev/test words are
# registered as no-create-entry so unseen words behave like <unk> at test time.
vocab = Vocabulary()
vocab.from_dataset(tr, field_name='text',
                   no_create_entry_dataset=[de, te])
vocab.index_dataset(tr, te, de, field_name='text')

# Label vocabulary: no <unk> token (every tag must be known), built over
# all splits so every tag receives an index.
target_vocab = Vocabulary(unknown=None)
target_vocab.from_dataset(tr, te, de, field_name='target')
target_vocab.index_dataset(tr, de, te, field_name='target')

data_bundle.set_vocab(field_name='text', vocab=vocab)
data_bundle.set_vocab(field_name='target', vocab=target_vocab)

# Mark fields as training targets / model inputs, then rename 'text' to
# the conventional fastNLP input field name 'words'.
for field in ('target', 'seq_len'):
    data_bundle.set_target(field)
for field in ('text', 'seq_len', 'target'):
    data_bundle.set_input(field)
data_bundle.rename_field('text', 'words')

In total 3 datasets:
train has 4478 instances.
test has 893 instances.
dev has 500 instances.
In total 2 vocabs:
target has 128 entries.
words has 952 entries.
4.模型训练

from fastNLP import SpanFPreRecMetric, Trainer
from fastNLP import LossInForward
from fastNLP.embeddings import BertEmbedding
# Bug fix: BiLSTMCRF was used below but never imported anywhere in the
# file, so the script raised NameError before training started.
from fastNLP.models import BiLSTMCRF
from torch.optim import Adam
import torch

# BERT embeddings looked up through the word vocabulary ('en' downloads /
# loads the cached English bert-base model).
embed = BertEmbedding(vocab=data_bundle.get_vocab('words'),
                      model_dir_or_name='en')
# BiLSTM-CRF sequence tagger; the CRF layer uses the tag vocabulary to
# constrain transitions between BIO labels.
model = BiLSTMCRF(
    embed=embed,
    num_classes=len(data_bundle.get_vocab('target')),
    num_layers=1,
    hidden_size=200,
    dropout=0.5,
    target_vocab=data_bundle.get_vocab('target'),
)
# Span-level F1 / precision / recall over decoded entity spans.
metric = SpanFPreRecMetric(tag_vocab=data_bundle.get_vocab('target'))
optimizer = Adam(model.parameters(), lr=2e-5)
# The model computes its own loss inside forward(); Trainer just reads it.
loss = LossInForward()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
trainer = Trainer(
    data_bundle.get_dataset('train'),
    model,
    loss=loss,
    optimizer=optimizer,
    batch_size=8,
    dev_data=data_bundle.get_dataset('dev'),
    metrics=metric,
    device=device,
)
trainer.train()

loading vocabulary file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\vocab.txt
Load pre-trained BERT parameters from file C:\Users\xiaoh\.fastNLP\embedding\bert-base-cased\pytorch_model.bin.
Bert Model will return 1 layers (layer-0 is embedding result): [-1]
input fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 33])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])
words: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 33])
target fields after batch(if batch size is 2):
target: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2, 33])
seq_len: (1)type:torch.Tensor (2)dtype:torch.int64, (3)shape:torch.Size([2])

training epochs started 2021-12-28-15-27-52-544078
Evaluate data in 1.47 seconds!
Evaluation on dev at Epoch 1/10. Step:560/5600:
SpanFPreRecMetric: f=0.76697, pre=0.770366, rec=0.763604

Evaluate data in 1.7 seconds!
Evaluation on dev at Epoch 2/10. Step:1120/5600:
SpanFPreRecMetric: f=0.888173, pre=0.888694, rec=0.887654

Evaluate data in 1.4 seconds!
Evaluation on dev at Epoch 3/10. Step:1680/5600:
SpanFPreRecMetric: f=0.923122, pre=0.922313, rec=0.923932

Evaluate data in 1.52 seconds!
Evaluation on dev at Epoch 4/10. Step:2240/5600:
SpanFPreRecMetric: f=0.943231, pre=0.938586, rec=0.947923

Evaluate data in 1.55 seconds!
Evaluation on dev at Epoch 5/10. Step:2800/5600:
SpanFPreRecMetric: f=0.954241, pre=0.950639, rec=0.95787

Evaluate data in 1.63 seconds!
Evaluation on dev at Epoch 6/10. Step:3360/5600:
SpanFPreRecMetric: f=0.966384, pre=0.965537, rec=0.967232

Evaluate data in 1.59 seconds!
Evaluation on dev at Epoch 7/10. Step:3920/5600:
SpanFPreRecMetric: f=0.972499, pre=0.972499, rec=0.972499

Evaluate data in 1.5 seconds!
Evaluation on dev at Epoch 8/10. Step:4480/5600:
SpanFPreRecMetric: f=0.972579, pre=0.96975, rec=0.975424

Evaluate data in 1.52 seconds!
Evaluation on dev at Epoch 9/10. Step:5040/5600:
SpanFPreRecMetric: f=0.973415, pre=0.971995, rec=0.974839

Evaluate data in 1.51 seconds!
Evaluation on dev at Epoch 10/10. Step:5600/5600:
SpanFPreRecMetric: f=0.977531, pre=0.974971, rec=0.980105

Reloaded the best model.

In Epoch:10/Step:5600, got best dev performance:
SpanFPreRecMetric: f=0.977531, pre=0.974971, rec=0.980105
{'best_eval': {'SpanFPreRecMetric': {'f': 0.977531,
'pre': 0.974971,
'rec': 0.980105}},
'best_epoch': 10,
'best_step': 5600,
'seconds': 505.73}

  • 0
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 7
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论 7
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值