【NLP】BiLSTM 命名实体识别 手写代码

【参考:pytorch_BiLSTM 命名实体识别 手写代码_哔哩哔哩_bilibili】

【参考:shouxieai/nlp-bilstm_crf-ner: nlp-bilstm+crf-ner】

数据样例

高 B-NAME
勇 E-NAME
: O
男 O
, O
中 B-CONT
国 M-CONT
国 M-CONT
籍 E-CONT
, O
无 O
境 O
外 O
居 O
留 O
权 O
, O
"""
2022/4/22
"""
import os

import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
from tqdm import tqdm


def build_corpus(data_type, make_vocab=True, data_dir="data"):
    """Load a BMES-tagged character corpus.

    Each line of ``<data_type>.char.bmes`` holds one character and its tag
    separated by a single space; sentences are separated by blank lines.

    :param data_type: dataset split name, e.g. ``'train'`` or ``'dev'``
    :param make_vocab: when True, also build and return word2id / tag2id maps
    :param data_dir: directory containing the ``.char.bmes`` files
    :return: ``(word_lists, tag_lists)`` sorted by sentence length
             (ascending); plus ``(word2id, tag2id)`` when ``make_vocab``
    """
    word_lists, tag_lists = [], []
    with open(os.path.join(data_dir, data_type + '.char.bmes'), 'r', encoding='utf-8') as f:
        word_list, tag_list = [], []
        for line in f:
            if line != '\n':  # sentences are separated by a blank line
                word, tag = line.strip().split(" ")
                word_list.append(word)
                tag_list.append(tag)
            else:
                word_lists.append(word_list)  # one full sentence collected
                tag_lists.append(tag_list)
                word_list, tag_list = [], []
        # Bug fix: without this flush, the final sentence is silently
        # dropped when the file does not end with a blank line.
        if word_list:
            word_lists.append(word_list)
            tag_lists.append(tag_list)

    # Sorting both lists by length independently keeps word/tag pairs aligned:
    # paired lists always have equal lengths and Python's sort is stable.
    word_lists = sorted(word_lists, key=len)
    tag_lists = sorted(tag_lists, key=len)

    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        word2id['<UNK>'] = len(word2id)  # fallback id for out-of-vocabulary chars
        word2id['<PAD>'] = len(word2id)  # padding id used when batching
        tag2id['<PAD>'] = len(tag2id)
        return word_lists, tag_lists, word2id, tag2id

    return word_lists, tag_lists


def build_map(lists):
    """Assign a sequential integer id to each distinct element.

    :param lists: 2-D list (a list of sequences)
    :return: dict mapping each distinct element to its id, in first-seen order
    """
    maps = {}
    for seq in lists:  # renamed from `list`: don't shadow the builtin
        for e in seq:
            if e not in maps:
                maps[e] = len(maps)  # ids auto-increment from 0
    return maps


class MyDataset(Dataset):
    """Index-encodes tokenized sentences and tag sequences for a DataLoader."""

    def __init__(self, datas, tags, word2idx, tag2idx):
        self.datas = datas
        self.tags = tags
        self.word2idx = word2idx
        self.tag2idx = tag2idx

    def __getitem__(self, index):
        """Return the index-encoded (chars, tags) pair for one sentence."""
        sentence = self.datas[index]
        labels = self.tags[index]
        unk = self.word2idx["<UNK>"]
        # Out-of-vocabulary characters fall back to <UNK>.
        data_index = [self.word2idx.get(ch, unk) for ch in sentence]
        tag_index = [self.tag2idx[t] for t in labels]
        return data_index, tag_index

    def __len__(self):
        return len(self.datas)

    def pro_batch_data(self, batch_datas):
        """collate_fn: pad every sequence in the batch to the batch maximum.

        Variable-length sentences cannot be stacked into one tensor, so
        each sequence is right-padded with the <PAD> id before conversion.
        """
        datas = [data for data, _ in batch_datas]
        tags = [tag for _, tag in batch_datas]
        batch_max_len = max(len(data) for data in datas)

        pad_word = self.word2idx['<PAD>']
        pad_tag = self.tag2idx['<PAD>']
        datas = [seq + [pad_word] * (batch_max_len - len(seq)) for seq in datas]
        tags = [seq + [pad_tag] * (batch_max_len - len(seq)) for seq in tags]

        return torch.tensor(datas, dtype=torch.int64), torch.tensor(tags, dtype=torch.long)


class Mymodel(nn.Module):
    """Embedding -> (Bi)LSTM -> linear classifier producing per-char tag scores."""

    def __init__(self, corpus_num, embedding_dim, hidden_num, class_num, bi=True):
        super(Mymodel, self).__init__()
        self.embedding = nn.Embedding(corpus_num, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_num,
            batch_first=True,
            bidirectional=bi,
        )
        # A bidirectional LSTM concatenates forward and backward hidden
        # states, doubling the feature size fed to the classifier.
        classifier_in = hidden_num * 2 if bi else hidden_num
        self.classifier = nn.Linear(classifier_in, class_num)

    def forward(self, batch_data):
        """Return tag scores of shape (batch, seq_len, class_num)."""
        embedded = self.embedding(batch_data)
        lstm_out, _ = self.lstm(embedded)
        return self.classifier(lstm_out)


if __name__ == "__main__":
    # Training set (also builds the vocabularies).
    train_data, train_tag, word2id, tag2id = build_corpus('train')
    id2tag = [tag for tag in tag2id]  # index -> tag (dicts preserve insertion order)
    # Dev set reuses the training vocabularies.
    dev_data, dev_tag = build_corpus('dev', make_vocab=False)
    # Bug fix: the embedding table must cover the vocabulary (len(word2id))
    # and the classifier must emit one score per tag (len(tag2id)).
    # The original used len(train_data)/len(train_tag) — sentence counts,
    # which are unrelated to either size.
    corpus_num = len(word2id)
    class_num = len(tag2id)

    epochs = 20
    train_batch_size = 30
    dev_batch_size = 30
    embedding_dim = 100
    hidden_num = 107
    bi = True
    lr = 0.001

    train_dataset = MyDataset(train_data, train_tag, word2id, tag2id)
    train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size,
                                  shuffle=False, collate_fn=train_dataset.pro_batch_data)

    dev_dataset = MyDataset(dev_data, dev_tag, word2id, tag2id)
    dev_dataloader = DataLoader(dev_dataset, batch_size=dev_batch_size,
                                shuffle=False, collate_fn=dev_dataset.pro_batch_data)

    model = Mymodel(corpus_num, embedding_dim, hidden_num, class_num, bi)
    criterion = nn.CrossEntropyLoss()
    # Renamed from `optim`, which shadowed the `torch.optim` module import.
    optimizer = optim.Adam(model.parameters(), lr=lr)

    if not os.path.exists('model.pth'):
        for epoch in range(epochs):
            model.train()
            train_loss = 0
            # Loop vars renamed so they no longer clobber the corpus lists
            # (the original rebound train_data/train_tag every batch).
            for batch_data, batch_tag in tqdm(train_dataloader):
                optimizer.zero_grad()
                predict = model(batch_data)  # (batch, seq_len, class_num)
                # Flatten to (batch*seq_len, class_num) vs (batch*seq_len,)
                # so CrossEntropyLoss scores every character position.
                predict = predict.reshape(-1, predict.shape[-1])
                batch_tag = batch_tag.reshape(-1)
                loss = criterion(predict, batch_tag)
                loss.backward()
                optimizer.step()

                train_loss += loss.item() / predict.size(0)

            train_loss = train_loss / len(train_dataloader.dataset)  # mean loss
            print('Epoch: {} \tTraining Loss: {:.6f}'.format(epoch, train_loss))
        # Save only the state_dict, not the whole pickled module.
        torch.save({'model': model.state_dict()}, 'model.pth')
    else:
        # Load previously trained weights.
        state_dict = torch.load('model.pth')
        model.load_state_dict(state_dict['model'])

    run_eval = True  # renamed from `eval`, which shadowed the builtin
    if run_eval:
        # Validation pass.
        model.eval()
        with torch.no_grad():
            dev_loss = 0
            real_label = []
            predict_label = []
            for batch_data, batch_tag in tqdm(dev_dataloader):
                predict = model(batch_data)
                predict = predict.reshape(-1, predict.shape[-1])
                predict_class = torch.argmax(predict, dim=1)  # fixed `predcit_class` typo
                predict_label.append(predict_class.numpy())

                batch_tag = batch_tag.reshape(-1)
                real_label.append(batch_tag.numpy())

                loss = criterion(predict, batch_tag)
                dev_loss += loss.item() / predict.size(0)

            real_label = np.concatenate(real_label)
            predict_label = np.concatenate(predict_label)

            # NOTE(review): accuracy counts <PAD> positions too, which
            # inflates the score; consider masking padding in the metrics.
            acc = np.sum(real_label == predict_label) / len(predict_label)
            dev_loss = dev_loss / len(dev_dataloader.dataset)  # mean loss
            print(f'dev_loss:{dev_loss},acc:{acc}')

    while True:
        text = input("请输入:")
        # Bug fix: unknown characters must map to <UNK>, not <PAD> —
        # the model never saw <PAD> in a real character position.
        text_index = [[word2id.get(ch, word2id["<UNK>"]) for ch in text]]
        text_index = torch.tensor(text_index, dtype=torch.int64)
        predict = model(text_index)
        predict = torch.argmax(predict, dim=-1).reshape(-1)
        pre = [id2tag[i] for i in predict]

        print([f'{w}_{s}' for w, s in zip(text, pre)])

效果不太理想

请输入:一是加快和海口市政府及有关部门沟通衔接,就桂林洋校区的围墙边界、规划区土地使用等问题达成一致意见。二是组织地理与环境科学学院、生命科学学院、体育学院等有关教学单位实地查看,有效利用未开发的区域,作为培养学生的实训和实验基地。三是按照校园总体规划,进一步加强桂林洋校区基础设施的建设和管理工作,同时打造和谐向上的校园文化。
['一_O', '是_O', '加_O', '快_O', '和_O', '海_O', '口_M-ORG', '市_M-ORG', '政_M-ORG', '府_E-ORG', '及_O', '有_O', '关_O', '部_O', '门_O', '沟_O', '通_O', '衔_M-TITLE', '接_O', ',_O', '就_O', '桂_O', '林_O', '洋_O', '校_O', '区_O', '的_O', '围_O', '墙_O', '边_O', '界_O', '、_O', '规_B-ORG', '划_M-ORG', '区_M-ORG', '土_M-ORG', '地_M-ORG', '使_O', '用_O', '等_O', '问_O', '题_O', '达_O', '成_O', '一_O', '致_O', '意_O', '见_O', '。_O', '二_O', '是_O', '组_O', '织_O', '地_O', '理_O', '与_O', '环_O', '境_O', '科_O', '学_O', '学_O', '院_O', '、_O', '生_O', '命_M-ORG', '科_M-ORG', '学_M-ORG', '学_M-ORG', '院_E-ORG', '、_O', '体_B-ORG', '育_M-ORG', '学_M-ORG', '院_E-ORG', '等_O', '有_O', '关_O', '教_O', '学_O', '单_O', '位_O', '实_M-EDU', '地_M-EDU', '查_O', '看_O', ',_O', '有_O', '效_O', '利_O', '用_O', '未_O', '开_O', '发_O', '的_O', '区_O', '域_O', ',_O', '作_O', '为_O', '培_O', '养_O', '学_O', '生_O', '的_O', '实_O', '训_O', '和_O', '实_O', '验_O', '基_O', '地_O', '。_O', '三_O', '是_O', '按_O', '照_O', '校_O', '园_O', '总_O', '体_O', '规_O', '划_O', ',_O', '进_O', '一_O', '步_O', '加_O', '强_O', '桂_O', '林_O', '洋_O', '校_O', '区_O', '基_O', '础_O', '设_O', '施_E-TITLE', '的_O', '建_O', '设_O', '和_O', '管_O', '理_O', '工_O', '作_O', ',_O', '同_O', '时_O', '打_O', '造_O', '和_O', '谐_O', '向_M-ORG', '上_M-ORG', '的_O', '校_O', '园_E-ORG', '文_O', '化_E-TITLE', '。_O']
请输入:会议指出,桂林洋校区基础设施建设已经取得重要进展,学生公寓、第二公共教学楼等设施相继投入使用,进一步改善了学生的学习和生活条件。为进一步落实省委“能力提升年”的安排部署,切实加快基础设施建设,更好服务全校师生。
['会_O', '议_O', '指_O', '出_O', ',_O', '桂_O', '林_M-ORG', '洋_M-ORG', '校_M-ORG', '区_O', '基_M-ORG', '础_M-ORG', '设_M-ORG', '施_M-ORG', '建_M-ORG', '设_M-ORG', '已_M-ORG', '经_M-ORG', '取_O', '得_O', '重_O', '要_O', '进_O', '展_O', ',_O', '学_O', '生_O', '公_O', '寓_O', '、_O', '第_B-ORG', '二_O', '公_M-ORG', '共_O', '教_O', '学_O', '楼_O', '等_O', '设_O', '施_O', '相_O', '继_M-ORG', '投_E-ORG', '入_O', '使_O', '用_E-TITLE', ',_O', '进_O', '一_O', '步_O', '改_O', '善_O', '了_O', '学_O', '生_O', '的_O', '学_O', '习_O', '和_O', '生_O', '活_O', '条_O', '件_O', '。_O', '为_O', '进_O', '一_O', '步_O', '落_O', '实_O', '省_O', '委_O', '“_O', '能_O', '力_O', '提_O', '升_O', '年_O', '”_O', '的_O', '安_O', '排_O', '部_O', '署_O', ',_O', '切_O', '实_O', '加_O', '快_O', '基_M-ORG', '础_O', '设_O', '施_M-TITLE', '建_E-ORG', '设_E-ORG', ',_O', '更_O', '好_O', '服_O', '务_O', '全_M-TITLE', '校_M-TITLE', '师_O', '生_O', '。_O']
请输入:校领导过建春、刁晓平、陈险峰、李森、韩尚峰、刘汝兵、黄忆军、王任斌调研桂林洋校区基建工作并召开现场办公会,相关职能部门负责人参加调研
['校_O', '领_O', '导_O', '过_O', '建_O', '春_O', '、_O', '刁_O', '晓_M-ORG', '平_M-ORG', '、_O', '陈_M-ORG', '险_M-ORG', '峰_M-ORG', '、_O', '李_M-ORG', '森_M-ORG', '、_O', '韩_O', '尚_M-ORG', '峰_M-ORG', '、_M-ORG', '刘_M-ORG', '汝_M-ORG', '兵_E-NAME', '、_O', '黄_M-ORG', '忆_M-ORG', '军_M-ORG', '、_M-ORG', '王_M-ORG', '任_M-ORG', '斌_M-ORG', '调_M-ORG', '研_M-ORG', '桂_M-ORG', '林_M-ORG', '洋_M-ORG', '校_M-ORG', '区_M-ORG', '基_M-ORG', '建_M-ORG', '工_M-ORG', '作_M-ORG', '并_M-ORG', '召_M-ORG', '开_M-ORG', '现_O', '场_M-TITLE', '办_M-TITLE', '公_M-TITLE', '会_E-TITLE', ',_O', '相_O', '关_O', '职_O', '能_M-TITLE', '部_M-TITLE', '门_M-TITLE', '负_M-TITLE', '责_M-TITLE', '人_E-TITLE', '参_O', '加_O', '调_M-TITLE', '研_M-TITLE']

  • 0
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值