【BERT Series】Text Classification

This is the first post in the hands-on BERT series: using BERT for text classification.

1. Preparation

1.1 Environment

  • python 3.7
  • pytorch 1.3
  • transformers 2.3 (installation tutorial)

1.2 Data

  • Sina news dataset (link: https://pan.baidu.com/s/1-Lck_ivs2ryBcrXY0HoR_g
    extraction code: 2uwc); each line holds a category label and the news text, separated by a tab, as in the sketch below
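
The format below is inferred from the parsing code in Section 2.1 (`line.split('\t')`); these two lines are made-up Chinese snippets with real category labels, shown only to illustrate the expected layout:

体育	昨晚的比赛中,主队凭借下半场的两粒进球完成逆转……
科技	某厂商今天正式发布了新一代自研手机芯片……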

2. Implementation

2.1 Training Code

import logging
import time

import numpy as np
import torch
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE  # regex used below to strip illegal control characters
from torch.utils.data import DataLoader, Dataset, RandomSampler
from tqdm import tqdm
from transformers import AdamW, BertConfig, BertForSequenceClassification, BertTokenizer

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# batch size
batch_size = 8
# learning rate
lr = 1e-5
# whether to use the GPU
cuda = False
# number of training epochs
epochs = 20
# maximum sequence length
max_length = 256

# build the attention mask (1 for real tokens, 0 for padding)
def get_atten_mask(tokens_ids, pad_index=0):
    return list(map(lambda x: 1 if x != pad_index else 0, tokens_ids))
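# e.g. padded token ids [101, 3300, 102, 0, 0] -> mask [1, 1, 1, 0, 0]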


# category name -> label id (entertainment, finance, sports, home & living,
# education, real estate, fashion, games, technology, politics)
news_type2id_dict = {'娱乐': 0, '财经': 1, '体育': 2, '家居': 3, '教育': 4, '房产': 5, '时尚': 6, '游戏': 7, '科技': 8, '时政': 9}


class NewsDataset(Dataset):

    def __init__(self, file_path, tokenizer: BertTokenizer, max_length=512, device=None):
        news_type = []
        news_content = []
        news_atten_mask = []
        seq_typ_ids = []
        with open(file_path, mode='r', encoding='utf8') as f:
            for line in tqdm(f.readlines()):
                line = line.strip()
                line = line.split('\t')

                news_type.append(news_type2id_dict[line[0]])
                token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(r'', line[1]), max_length=max_length,
                                             pad_to_max_length=True)
                news_content.append(token_ids)
                news_atten_mask.append(get_atten_mask(token_ids))
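                # for a single, already padded sequence this simply returns
                # max_length zeros (segment ids of sentence A only)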
                seq_typ_ids.append(tokenizer.create_token_type_ids_from_sequences(token_ids_0=token_ids[1:-1]))

        self.label = torch.from_numpy(np.array(news_type)).unsqueeze(1).long()
        self.token_ids = torch.from_numpy(np.array(news_content)).long()
        self.seq_type_ids = torch.from_numpy(np.array(seq_typ_ids)).long()
        self.atten_masks = torch.from_numpy(np.array(news_atten_mask)).long()
        if device is not None:
            self.label = self.label.to(device)
            self.token_ids = self.token_ids.to(device)
            self.seq_type_ids = self.seq_type_ids.to(device)
            self.atten_masks = self.atten_masks.to(device)

    def __len__(self):
        return self.label.shape[0]

    def __getitem__(self, item):
        return self.label[item], self.token_ids[item], self.seq_type_ids[item], self.atten_masks[item]
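    # each item is a tuple (label, token_ids, seq_type_ids, atten_masks) with
    # shapes [1], [max_length], [max_length] and [max_length] respectively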


def train(train_dataset, model: BertForSequenceClassification, optimizer: AdamW, batch_size=batch_size):
    train_sampler = RandomSampler(train_dataset)
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)
    model.train()
    tr_loss = 0.0
    tr_acc = 0
    global_step = 0
    if cuda:
        torch.cuda.empty_cache()
    for step, batch in tqdm(enumerate(train_loader)):
        # print(step)
        inputs = {
            'input_ids': batch[1],
            'token_type_ids': batch[2],
            'attention_mask': batch[3],
            'labels': batch[0]
        }
        outputs = model(**inputs)
        loss = outputs[0]
        # print(loss)
        logits = outputs[1]

        tr_loss += loss.item()

        model.zero_grad()
        loss.backward()
        optimizer.step()
        # accumulate the number of correct predictions
        _, pred = logits.max(1)
        number_corr = (pred == batch[0].view(-1)).long().sum().item()
        tr_acc += number_corr
        global_step += 1

    return tr_loss / global_step, tr_acc / len(train_dataset)


def evaluate(eval_dataset, model: BertForSequenceClassification, batch_size=batch_size):
    model.eval()
    eval_sampler = RandomSampler(eval_dataset)
    eval_loader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)
    tr_acc = 0
    if cuda:
        torch.cuda.empty_cache()
    for step, batch in tqdm(enumerate(eval_loader)):
        inputs = {
            'input_ids': batch[1],
            'token_type_ids': batch[2],
            'attention_mask': batch[3],
            'labels': batch[0]
        }
        outputs = model(**inputs)
        # loss = outputs[0]
        logits = outputs[1]

        # tr_loss += loss.item()

        # accumulate the number of correct predictions
        _, pred = logits.max(1)
        number_corr = (pred == batch[0].view(-1)).long().sum().item()
        tr_acc += number_corr

    return tr_acc / len(eval_dataset)


def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs


if __name__ == '__main__':
    device = torch.device('cuda' if cuda else 'cpu')
    # build the config
    config = BertConfig.from_pretrained('./model/bert_config.json', num_labels=len(news_type2id_dict))
    # build the classifier
    classifier = BertForSequenceClassification.from_pretrained('./model/pytorch_model.bin', config=config).to(device)
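    # group parameters for AdamW; note that both groups use weight_decay=0.0 here,
    # so the split has no effect (a common choice is 0.01 for the first group)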
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in classifier.named_parameters() if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.0},
        {'params': [p for n, p in classifier.named_parameters() if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # build the tokenizer
    tokenizer = BertTokenizer('./model/vocab.txt')

    optimizer = AdamW(optimizer_grouped_parameters, lr=lr, eps=1e-8)

    logger.info('create train dataset')

    train_dataset = NewsDataset('./data/cnews/cnews.train.txt', tokenizer, max_length=max_length,
                                device=device)
    logger.info('create eval dataset')
    eval_dataset = NewsDataset('./data/cnews/cnews.val.txt', tokenizer, max_length=max_length,
                               device=device)
    best_val_acc = 0.0
    for e in range(1, epochs + 1):
        start_time = time.time()
        train_loss, train_acc = train(train_dataset, classifier, optimizer, batch_size)
        eval_acc = evaluate(eval_dataset, classifier, batch_size)
        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        logger.info('Epoch: {:02} | Time: {}m {}s'.format(e, epoch_mins, epoch_secs))
        logger.info(
            'Train Loss: {:.6f} | Train Acc: {:.6f} | Eval Acc: {:.6f}'.format(train_loss, train_acc, eval_acc))
        if eval_acc > best_val_acc:
            best_val_acc = eval_acc
            torch.save(classifier.state_dict(), './fine_tune_model/model_{}'.format(e))
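
Section 3 below quotes a test-set accuracy, while the script above only scores the validation split. A minimal sketch of scoring a held-out test file, assuming the dataset also provides a cnews.test.txt in the same format and reusing NewsDataset, evaluate, tokenizer and device from the script above (model_7 stands in for whichever saved checkpoint performed best):

# evaluate the best checkpoint on the held-out test split (illustrative sketch)
test_dataset = NewsDataset('./data/cnews/cnews.test.txt', tokenizer, max_length=max_length, device=device)
classifier.load_state_dict(torch.load('./fine_tune_model/model_7', map_location=device))
test_acc = evaluate(test_dataset, classifier, batch_size)
logger.info('Test Acc: {:.6f}'.format(test_acc))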

2.2 Prediction Code

import torch
from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

news_type_id_dict = {'娱乐': 0, '财经': 1, '体育': 2, '家居': 3, '教育': 4, '房产': 5, '时尚': 6, '游戏': 7, '科技': 8, '时政': 9}

news_id_type_dict = {v: k for k, v in news_type_id_dict.items()}

def get_atten_mask(tokens_ids, pad_index=0):
    return list(map(lambda x: 1 if x != pad_index else 0, tokens_ids))


config = BertConfig.from_pretrained('./model/bert_config.json', num_labels=len(news_type_id_dict))

classifier = BertForSequenceClassification.from_pretrained('./fine_tune_model/model_7', config=config)
classifier.eval()

tokenizer = BertTokenizer('./model/vocab.txt')
index = 0
def predict(text):
    global index
    text = str(text).strip()
    token_ids = tokenizer.encode(ILLEGAL_CHARACTERS_RE.sub(r'', text), max_length=256,
                                 pad_to_max_length=True)
    token_mask = get_atten_mask(token_ids)

    token_segment_type = tokenizer.create_token_type_ids_from_sequences(token_ids_0=token_ids[1:-1])

    token_ids = torch.LongTensor(token_ids).unsqueeze(0)
    token_mask = torch.LongTensor(token_mask).unsqueeze(0)
    token_segment_type = torch.LongTensor(token_segment_type).unsqueeze(0)

    inputs = {
        'input_ids': token_ids,
        'token_type_ids': token_segment_type,
        'attention_mask': token_mask,
        # 'labels': batch[0]
    }
    # no gradients are needed at inference time
    with torch.no_grad():
        logits = classifier(**inputs)
    _, pred = logits[0].max(1)
    # print(str(index) + news_id_type_dict[pred.item()])
    index += 1
    return news_id_type_dict[pred.item()]


if __name__ == '__main__':

    news = '''
    对于我国的科技巨头华为而言,2019年注定是不平凡的一年,由于在5G领域遥遥领先于其他国家,华为遭到了不少方面的觊觎,并因此承受了太多不公平地对待,在零部件供应、核心技术研发、以及市场等多个领域受到了有意打压。
    但是华为并没有因此而一蹶不振,而是亮出了自己的一张又一张“底牌”,随着麒麟处理器、海思半导体以及鸿蒙操作系统的闪亮登场,华为也向世界证明了自己的实力,上演了一场几乎完美的绝地反击。
    '''
    print(predict(news))
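    # prints the predicted category name (one of the ten keys in news_type_id_dict)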

3. Results

  • With a learning rate of 1e-5, accuracy reaches 99.6% on the training set and 97.26% on the test set