bert 是单标签还是多标签的分类_项目实践 | BERT的多标签分类实现

最新推荐文章于 2023-01-31 14:02:45 发布

醉卧南山下

最新推荐文章于 2023-01-31 14:02:45 发布

阅读量386

点赞数

文章标签： bert 是单标签还是多标签的分类

本文链接：https://blog.csdn.net/weixin_30510787/article/details/112262868

版权

开源代码：

一、多标签（multi-label）分类任务和多分类(multi-class)任务的区别

多标签分类任务中一条数据可以同时拥有多个标签（标签之间互不排斥），每个标签又可能有多个类别，比如一个算法工程师的简历，可能同时被打上自然语言处理、推荐系统等多个标签。
多分类任务中一条数据只有一个标签，但这个标签可能有多种类别，比如判定某个高中生的年级，只能归类为高一，高二，高三其中一个。

二、代码

环境：PyCharm（下文代码使用 TensorFlow 1.x 的 API，如 tf.placeholder、tf.gfile，需要安装 TensorFlow 1.x）
第一步：建立classify文件夹，然后进入classify文件夹，把Bert模型从Github上Clone下来。

git clone https://github.com/google-research/bert.git

第二步：对输入数据进行预处理，建立processor.py。

import os
import tensorflow as tf
import csv
import classify.bert.tokenization as tokenization

class InputExample(object):
    """One raw (not yet tokenized) example for sequence classification."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the four fields unchanged on the instance.

        Args:
            guid: Identifier of the example.
            text_a: Text of the first sequence.
            text_b: Optional text of the second sequence; defaults to None.
            label: Optional label of the example; defaults to None.
        """
        self.guid, self.text_a = guid, text_a
        self.text_b, self.label = text_b, label

class PaddingInputExample(object):
    """Marker type for a padding (non-real) example.

    The class carries no data; code that converts examples can test
    ``isinstance(example, PaddingInputExample)`` to recognise a placeholder.
    The original header had no body, which is a syntax error in Python.
    """
class InputFeatures(object):
    """Plain container for the model-ready features of one example."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, is_real_example=True):
        """Store the given features unchanged on the instance.

        The first parameter was misspelled ``inpit_ids`` while the body read
        ``input_ids``, so every construction raised NameError.

        Args:
            input_ids: Token ids of the example.
            input_mask: Mask aligned with ``input_ids``.
            segment_ids: Segment ids aligned with ``input_ids``.
            label_id: Label of the example.
            is_real_example: False for padding examples; defaults to True.
        """
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.is_real_example = is_real_example

class MyProcessor():
    """Reads the train/dev/test TSV files and converts rows into examples.

    Also hosts two tokenization helpers (``_truncate_seq_pair`` and
    ``convert_single_example_simple``). They take no ``self`` in the original
    code, so they are exposed as static methods here.
    """

    def get_train_examples(self, data_dir):
        """Return the examples read from ``train.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``dev.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """Return the examples read from ``test.tsv`` inside ``data_dir``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """Return the list of label strings used by this task."""
        return ["0", "1", "2"]

    def _read_tsv(self, input_file, quotechar=None):
        """Read a tab-separated file and return its rows as lists of strings.

        Args:
            input_file: Path of the file, opened through ``tf.gfile.Open``.
            quotechar: Forwarded to ``csv.reader``.
        """
        with tf.gfile.Open(input_file, "r") as f:
            # The delimiter must be a real TAB; the original had the bare
            # letter "t" (the backslash was lost), which would split on 't'.
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return list(reader)

    def _create_examples(self, lines, set_type):
        """Build one ``InputExample`` per row, skipping the first row.

        Args:
            lines: Rows as returned by ``_read_tsv``.
            set_type: Prefix used in each example's guid ("train", "dev", ...).

        Returns:
            A list of ``InputExample`` objects with ``text_b`` set to None.
        """
        examples = []
        print(len(lines))
        for (i, line) in enumerate(lines):
            if i == 0:
                # First row is skipped (presumably a header line).
                continue
            guid = "%s-%s" % (set_type, i)
            # NOTE(review): column layout assumed to be <label>\t<text>.
            # The original indexed the text with the row number (line[i]) and
            # passed an undefined name `label`; column 0 was read but unused,
            # so it is taken to be the label -- confirm against the data.
            text_a = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            examples.append(
                InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples

    @staticmethod
    def _truncate_seq_pair(tokens_a, tokens_b, max_length):
        """Trim two token lists in place until their total length fits.

        One token at a time is removed from the end of the longer list
        (from ``tokens_b`` on ties) until
        ``len(tokens_a) + len(tokens_b) <= max_length``.
        """
        # A negative budget can never be met; clamp so we stop at two empty
        # lists instead of popping from an empty list.
        max_length = max(max_length, 0)
        while len(tokens_a) + len(tokens_b) > max_length:
            if len(tokens_a) > len(tokens_b):
                tokens_a.pop()
            else:
                tokens_b.pop()

    # NOTE(review): unused by the code visible here; kept because the rest of
    # the file is not visible.
    import pickle

    @staticmethod
    def convert_single_example_simple(max_seq_length, tokenizer, text_a, text_b=None):
        """Tokenize one text (or text pair) into fixed-length model inputs.

        The sequence is laid out as ``[CLS] a... [SEP]`` or
        ``[CLS] a... [SEP] b... [SEP]`` and zero-padded to ``max_seq_length``.

        Args:
            max_seq_length: Length of every returned list; must be >= 2.
            tokenizer: Object providing ``tokenize(text)`` and
                ``convert_tokens_to_ids(tokens)``.
            text_a: First text.
            text_b: Optional second text.

        Returns:
            ``(input_ids, input_mask, segment_ids)``, three lists of length
            ``max_seq_length``. The mask is 1 on real tokens and 0 on padding;
            segment ids are 0 for the first segment and 1 for the second.

        Raises:
            ValueError: If ``max_seq_length`` cannot hold [CLS] and [SEP].
        """
        if max_seq_length < 2:
            raise ValueError("max_seq_length must be at least 2")

        tokens_a = tokenizer.tokenize(text_a)
        tokens_b = tokenizer.tokenize(text_b) if text_b else None

        if tokens_b:
            # Reserve room for [CLS], [SEP], [SEP].
            MyProcessor._truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
        elif len(tokens_a) > max_seq_length - 2:
            # Reserve room for [CLS] and [SEP].
            tokens_a = tokens_a[:max_seq_length - 2]

        # The original nested everything below under the `else` branch, so the
        # text-pair path fell through and returned None.
        tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
        segment_ids = [0] * len(tokens)
        if tokens_b:
            tokens += list(tokens_b) + ["[SEP]"]
            # Second segment (including its closing [SEP]) is marked with 1;
            # the original marked its tokens with 0.
            segment_ids += [1] * (len(tokens_b) + 1)

        input_ids = tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)

        padding = [0] * (max_seq_length - len(input_ids))
        input_ids = list(input_ids) + padding
        input_mask += padding
        segment_ids += padding

        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(segment_ids) == max_seq_length
        return input_ids, input_mask, segment_ids


# The original body called `_truncate_seq_pair` as a bare (module-level) name,
# so both helpers are also exposed at module level for code written that way.
_truncate_seq_pair = MyProcessor._truncate_seq_pair
convert_single_example_simple = MyProcessor.convert_single_example_simple

def convert_single_example(ex_index,example,label_list,max_seq_length,tokenizer):
  if isinstance(example,PaddingInputExample):
    return InputFeatures(
        input

第三步：对数据进行batch和shuffle，建立text_loader.py。

import numpy as np

class Textloader(object):
    """Shuffles a dataset once and serves it in fixed-size batches.

    Each row of the dataset is indexed as ``row[:3]`` (the three model
    inputs) and ``row[3]`` (the label).
    """

    def __init__(self, dataSet, batch_size):
        """Keep a reference to ``dataSet`` and shuffle it immediately.

        Args:
            dataSet: Mutable sequence of rows; it is shuffled IN PLACE.
            batch_size: Number of rows per batch.

        Raises:
            ValueError: If ``dataSet`` holds fewer than ``batch_size`` rows.
        """
        self.data = dataSet
        self.batch_size = batch_size
        self.shuff()

    def shuff(self):
        """Recompute ``num_batches`` and shuffle the data in place."""
        # Rows that do not fill a whole batch are never served.
        self.num_batches = len(self.data) // self.batch_size
        if self.num_batches == 0:
            # An explicit exception instead of `assert False`, which is
            # stripped when Python runs with -O.
            raise ValueError('Not enough data, make batch_size small.')
        np.random.shuffle(self.data)

    def next_batch(self, k):
        """Return the ``k``-th batch as ``(x, y)`` NumPy arrays.

        Args:
            k: Zero-based batch index.

        Returns:
            ``x``: array built from ``row[:3]`` of every row in the batch.
            ``y``: array of ``row[3]`` values reshaped to ``(batch_size, 1)``.
        """
        start = k * self.batch_size
        rows = [self.data[start + offset] for offset in range(self.batch_size)]
        # The original converted and returned inside the loop, i.e. after the
        # first row only, which made the reshape fail for batch_size > 1.
        x = np.array([row[:3] for row in rows])
        y = np.array([row[3] for row in rows]).reshape(self.batch_size, 1)
        return x, y


# model.py imports this class as `TextLoader`; keep both spellings importable.
TextLoader = Textloader

第四步：调用Bert模型，建立model.py。

import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import classify.bert.modeling
from classify.text_loader import TextLoader
import numpy as np

class Project_model():
    """BERT sentence classifier built as a TensorFlow 1.x graph.

    Graph: BERT pooled output -> dropout -> linear layer -> softmax, trained
    with softmax cross-entropy. Each example therefore receives exactly one of
    ``num_labels`` classes (single-label, multi-class).
    """

    def __init__(self, bert_root, data_path, temp_path, model_save_path,
                 batch_size, max_len, lr, keep_prob, num_labels=3):
        """Store the settings and build the whole graph.

        Args:
            bert_root: Directory holding ``bert_config.json``,
                ``bert_model.ckpt`` and ``vocab.txt``.
            data_path: Stored on the instance; not used inside this class.
            temp_path: Stored on the instance; not used inside this class.
            model_save_path: Stored on the instance; not used inside this class.
            batch_size: Batch size used by ``evaluate``.
            max_len: Sequence length of the input placeholders.
            lr: Learning rate passed to the Adam optimizer.
            keep_prob: Keep probability of the dropout on the pooled output.
            num_labels: Number of classes; defaults to 3, the value the
                original code hard-coded.
        """
        self.bert_root = bert_root
        self.data_path = data_path
        self.temp_path = temp_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.max_len = max_len
        self.lr = lr
        self.keep_prob = keep_prob
        self.num_labels = num_labels

        # Order matters: each step uses tensors created by the previous one.
        self.bert_config()
        self.get_output()
        self.get_accuracy()
        self.get_trainOp()

    def bert_config(self):
        """Create placeholders, the output layer variables and the BERT model.

        NOTE: the assignment to ``self.bert_config`` below replaces this
        method on the instance with the loaded config object, so the method
        can only be called once per instance (``__init__`` does that).
        """
        # The file does `import classify.bert.modeling`, which binds only the
        # name `classify`; alias the module locally.
        modeling = classify.bert.modeling

        bert_config_file = os.path.join(self.bert_root, 'bert_config.json')
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = os.path.join(self.bert_root, 'bert_model.ckpt')
        self.bert_vocab_file = os.path.join(self.bert_root, 'vocab.txt')

        # tf.placeholder requires a dtype; the original passed no arguments.
        self.input_ids = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="input_ids")
        self.input_mask = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="input_mask")
        self.segment_ids = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="segment_ids")
        # Label ids arrive as a (batch, 1) column from the text loader.
        self.input_y = tf.placeholder(tf.int32, shape=[None, 1], name="input_y")

        self.global_step = tf.Variable(0, trainable=False)

        # Width of the pooled output comes from the BERT config (768 was
        # hard-coded before).
        hidden_size = self.bert_config.hidden_size
        output_weights = tf.get_variable(
            "output_weights", [self.num_labels, hidden_size],
            initializer=tf.random_normal_initializer(stddev=0.1))
        output_bias = tf.get_variable(
            "output_bias", [self.num_labels],
            initializer=tf.random_normal_initializer(stddev=0.1))

        self.w_out = output_weights
        self.b_out = output_bias

        model = modeling.BertModel(
            config=self.bert_config,
            is_training=False,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)

        # Initialise the variables from the pre-trained checkpoint.
        tvars = tf.trainable_variables()
        (assignment, _) = modeling.get_assignment_map_from_checkpoint(
            tvars, self.init_checkpoint)
        tf.train.init_from_checkpoint(self.init_checkpoint, assignment)

        output_layer_pooled = model.get_pooled_output()
        # NOTE(review): keep_prob is a fixed Python value, so this dropout is
        # also active in `evaluate` -- confirm whether that is intended.
        self.output_layer_pooled = tf.nn.dropout(
            output_layer_pooled, keep_prob=self.keep_prob)

    def get_output(self):
        """Add logits, probabilities and the mean cross-entropy loss."""
        logits = tf.matmul(self.output_layer_pooled, self.w_out, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, self.b_out)
        self.probabilities = tf.nn.softmax(self.logits, axis=-1, name='y')
        # log_softmax = logits - log(reduce_sum(exp(logits), axis))
        self.log_probs = tf.nn.log_softmax(self.logits, axis=-1)

        # Flatten (batch, 1) -> (batch,) first; one-hot of the unflattened
        # tensor is (batch, 1, C) and would broadcast against (batch, C).
        label_ids = tf.reshape(self.input_y, [-1])
        one_hot_labels = tf.one_hot(
            label_ids, depth=self.num_labels, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * self.log_probs, axis=-1)
        self.loss = tf.reduce_mean(per_example_loss)
        tf.summary.scalar('loss', self.loss)

    def get_accuracy(self):
        """Add prediction, accuracy and merged-summary tensors."""
        self.predicts = tf.argmax(self.logits, axis=-1)
        # input_y already holds class ids; argmax over its size-1 axis (as the
        # original did) is always 0. Cast to int64 to match tf.argmax.
        self.actuals = tf.cast(tf.reshape(self.input_y, [-1]), tf.int64)
        self.accuracy = tf.reduce_mean(
            tf.cast(tf.equal(self.predicts, self.actuals), dtype=tf.float32))
        tf.summary.scalar('accuracy', self.accuracy)
        self.merged = tf.summary.merge_all()

    def get_trainOp(self):
        """Create and return the Adam training op."""
        # Passing global_step makes the optimizer increment it on every update.
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.loss, global_step=self.global_step)
        return self.train_op

    def _feed_dict(self, x_batch, y_batch):
        """Map one batch from the text loader onto the four placeholders."""
        return {self.input_ids: x_batch[:, 0],
                self.input_mask: x_batch[:, 1],
                self.segment_ids: x_batch[:, 2],
                self.input_y: y_batch}

    def evaluate(self, sess, devdata):
        """Return the mean batch accuracy on ``devdata``, in percent.

        Args:
            sess: TensorFlow session holding the model variables.
            devdata: Rows accepted by ``TextLoader``; note that ``TextLoader``
                shuffles the data it is given.
        """
        data_loader = TextLoader(devdata, self.batch_size)
        accuracies = []
        for i in range(data_loader.num_batches):
            x_batch, y_batch = data_loader.next_batch(i)
            accuracy = sess.run(
                self.accuracy, feed_dict=self._feed_dict(x_batch, y_batch))
            accuracies.append(accuracy)
        acc = np.mean(accuracies) * 100
        return acc

    def run_step(self, sess, x_train, y_train):
        """Run one optimisation step on a batch.

        The original fetched three tensors but unpacked four values, never
        ran ``train_op`` and returned nothing.

        Returns:
            ``(step, loss, summary)``: the global step, the batch loss and the
            serialized merged summary.
        """
        step, loss_, _, log = sess.run(
            [self.global_step, self.loss, self.train_op, self.merged],
            feed_dict=self._feed_dict(x_train, y_train))
        return step, loss_, log

第五步：建立主函数，建立run.py。

参考：
(1)多标签学习综述（A review on multi-label learning algorithms）

(2)深度学习模型处理多标签（multi_label）分类任务——keras实战

(3)NLP必读：十分钟读懂谷歌BERT模型