开源代码:
一、多标签(multi-label)分类任务 和 多分类(multi-class)任务的区别
- 多标签分类任务中一条数据可能有多个标签,每个标签可能有多个类别,比如一个算法工程师的简历,可能被归类为自然语言处理,推荐系统等多个标签中。
- 多分类任务中一条数据只有一个标签,但这个标签可能有多种类别,比如判定某个高中生的年级,只能归类为高一,高二,高三其中一个。
二、代码
环境:PyCharm
第一步:建立classify文件夹。
第二步:进入classify文件夹,把Bert模型从Github上Clone下来。
git clone https://github.com/google-research/bert.git
第二步:对输入数据进行预处理,建立processor.py。
import os
import tensorflow as tf
import csv
import classify.bert.tokenization as tokenization
class InputExample(object):
    """A single raw example: an id, one or two text sequences, and a label."""

    def __init__(self, guid, text_a, text_b=None, label=None):
        """Store the fields of one example unchanged.

        Args:
            guid: Identifier of the example.
            text_a: First (or only) text sequence.
            text_b: Optional second text sequence; defaults to None.
            label: Optional label of the example; defaults to None.
        """
        self.guid = guid
        self.text_a = text_a
        self.text_b = text_b
        self.label = label
class PaddingInputExample(object):
    """Marker type for a placeholder example that carries no data.

    Code in this file distinguishes such placeholders from real examples
    with ``isinstance(example, PaddingInputExample)``.
    """
class InputFeatures(object):
    """Numeric features of one example, ready to be fed to the model."""

    def __init__(self, input_ids, input_mask, segment_ids, label_id, is_real_example=True):
        """Store the feature fields unchanged.

        Args:
            input_ids: Token ids of the example.
            input_mask: Mask aligned with ``input_ids``.
            segment_ids: Segment ids aligned with ``input_ids``.
            label_id: Id of the example's label.
            is_real_example: False for padding placeholders; defaults to True.
        """
        # The first parameter was misspelled ``inpit_ids`` in the original,
        # which made the assignment below raise NameError.
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.label_id = label_id
        self.is_real_example = is_real_example
class MyProcessor():
    """Loads train/dev/test TSV files and converts their rows to InputExamples."""

    def get_train_examples(self, data_dir):
        """Return the examples read from ``<data_dir>/train.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "train.tsv")), "train")

    def get_dev_examples(self, data_dir):
        """Return the examples read from ``<data_dir>/dev.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev")

    def get_test_examples(self, data_dir):
        """Return the examples read from ``<data_dir>/test.tsv``."""
        return self._create_examples(
            self._read_tsv(os.path.join(data_dir, "test.tsv")), "test")

    def get_labels(self):
        """Return the list of label strings used by this task."""
        return ["0", "1", "2"]

    def _read_tsv(self, input_file, quotechar=None):
        """Read a tab-separated file and return its rows as lists of strings.

        Args:
            input_file: Path of the file to read.
            quotechar: Passed through to ``csv.reader``.
        """
        with tf.gfile.Open(input_file, "r") as f:
            # The delimiter is a real tab character, not the letter "t".
            reader = csv.reader(f, delimiter="\t", quotechar=quotechar)
            return list(reader)

    def _create_examples(self, lines, set_type):
        """Build one InputExample per data row.

        The first row is skipped as a header. Each remaining row is read as
        ``label<TAB>text``.
        NOTE(review): the column layout (label in column 0, text in column 1)
        is inferred from the original indexing -- confirm against the data.

        Args:
            lines: Rows as returned by ``_read_tsv``.
            set_type: Prefix used for the example ids ("train", "dev", "test").
        """
        examples = []
        print(len(lines))
        for (i, line) in enumerate(lines):
            if i == 0:
                continue
            if not line:
                # csv.reader yields [] for blank lines; there is nothing to index.
                continue
            guid = "%s-%s" % (set_type, i)
            text_a = tokenization.convert_to_unicode(line[1])
            label = tokenization.convert_to_unicode(line[0])
            examples.append(InputExample(guid=guid, text_a=text_a, text_b=None, label=label))
        return examples
def _truncate_seq_pair(tokens_a,tokens_b,max_length):
while True:
total_length = len(tokens_a) + len(tokens_b)
if total_length <= max_length:
break
if len(tokens_a) > len(tokens_b)
tokens_a.pop()
else:
tokens_b.pop()
import pickle
def convert_single_example_simple(max_seq_length, tokenizer, text_a, text_b=None):
    """Turn one or two texts into fixed-length id, mask and segment lists.

    The output layout is ``[CLS] a... [SEP]`` for a single text and
    ``[CLS] a... [SEP] b... [SEP]`` for a pair, zero-padded on the right to
    ``max_seq_length``.

    Args:
        max_seq_length: Length of each returned list.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        text_a: First text.
        text_b: Optional second text.

    Returns:
        A tuple ``(input_ids, input_mask, segment_ids)`` of three lists, each
        of length ``max_seq_length``.
    """
    tokens_a = tokenizer.tokenize(text_a)
    tokens_b = None
    if text_b:
        tokens_b = tokenizer.tokenize(text_b)

    if tokens_b:
        # Reserve three positions for [CLS], [SEP], [SEP].
        _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)
    else:
        # Reserve two positions for [CLS] and [SEP].
        if len(tokens_a) > max_seq_length - 2:
            tokens_a = tokens_a[0:(max_seq_length - 2)]

    tokens = ["[CLS]"] + list(tokens_a) + ["[SEP]"]
    segment_ids = [0] * len(tokens)

    if tokens_b:
        # The second sequence and its closing [SEP] belong to segment 1; the
        # original marked the tokens themselves as segment 0 by mistake.
        tokens.extend(tokens_b)
        tokens.append("[SEP]")
        segment_ids.extend([1] * (len(tokens_b) + 1))

    input_ids = tokenizer.convert_tokens_to_ids(tokens)
    # Mask is 1 for real tokens and 0 for the padding appended below.
    input_mask = [1] * len(input_ids)

    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length
    return input_ids, input_mask, segment_ids
def convert_single_example(ex_index,example,label_list,max_seq_length,tokenizer):
if isinstance(example,PaddingInputExample):
return InputFeatures(
input
第三步:对数据进行batch和shuffle,建立text_loader.py。
import numpy as np
class Textloader(object):
    """Shuffles a dataset once and serves it in fixed-size batches.

    Each item of the dataset is indexed as ``item[:3]`` (the three feature
    sequences) and ``item[3]`` (the label).
    """

    def __init__(self, dataSet, batch_size):
        """Store the dataset and batch size, then shuffle the dataset.

        Args:
            dataSet: Mutable sequence of items; it is shuffled in place.
            batch_size: Number of items per batch.

        Raises:
            AssertionError: If the dataset holds fewer than ``batch_size`` items.
        """
        self.data = dataSet
        self.batch_size = batch_size
        self.shuff()

    def shuff(self):
        """Compute the number of full batches and shuffle the data in place."""
        self.num_batches = int(len(self.data) // self.batch_size)
        if self.num_batches == 0:
            # Raised explicitly so the check survives ``python -O``; the
            # exception type is the same one the original ``assert`` produced.
            raise AssertionError('Not enough data, make batch_size small.')
        np.random.shuffle(self.data)

    def next_batch(self, k):
        """Return the k-th batch as ``(x, y)`` numpy arrays.

        Args:
            k: Zero-based batch index.

        Returns:
            ``x`` holds the first three fields of every item in the batch;
            ``y`` holds the fourth field, reshaped to ``(batch_size, 1)``.
        """
        # Materialise the data once instead of twice per row.
        rows = list(self.data)
        start = k * self.batch_size
        batch = [rows[start + i] for i in range(self.batch_size)]
        x = np.array([row[:3] for row in batch])
        y = np.array([row[3] for row in batch])
        y = y.reshape(self.batch_size, 1)
        return x, y


# The model code imports this class under the name ``TextLoader``.
TextLoader = Textloader
第四步:调用Bert模型,建立model.py。
import os
# Set before TensorFlow is imported so the device mask is already in place;
# "-1" hides every CUDA device.
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
import classify.bert.modeling
from classify.bert import modeling
from classify.text_loader import TextLoader
import numpy as np
class Project_model():
    """TF1 graph that puts a 3-class softmax head on BERT's pooled output.

    Constructing an instance builds the whole graph: placeholders, the BERT
    encoder initialised from a checkpoint, the classification head, the loss,
    the accuracy metric and the training op.
    """

    # Number of target classes; the original hard-coded 3 for both the output
    # layer and the one-hot depth.
    NUM_LABELS = 3

    def __init__(self, bert_root, data_path, temp_path, model_save_path,
                 batch_size, max_len, lr, keep_prob):
        """Store the settings and build the graph.

        Args:
            bert_root: Directory holding ``bert_config.json``,
                ``bert_model.ckpt`` and ``vocab.txt``.
            data_path: Stored on the instance; not used by this class.
            temp_path: Stored on the instance; not used by this class.
            model_save_path: Stored on the instance; not used by this class.
            batch_size: Batch size used by ``evaluate``.
            max_len: Sequence length of the id/mask/segment placeholders.
            lr: Learning rate of the Adam optimizer.
            keep_prob: Keep probability of the dropout on the pooled output.
        """
        self.bert_root = bert_root
        self.data_path = data_path
        self.temp_path = temp_path
        self.model_save_path = model_save_path
        self.batch_size = batch_size
        self.max_len = max_len
        self.lr = lr
        self.keep_prob = keep_prob
        self.bert_config()
        self.get_output()
        self.get_accuracy()
        self.get_trainOp()

    def bert_config(self):
        """Create the placeholders, the head variables and the BERT encoder."""
        bert_config_file = os.path.join(self.bert_root, 'bert_config.json')
        # NOTE: this assignment shadows the method of the same name on the
        # instance, so bert_config() can only be called once per object.
        self.bert_config = modeling.BertConfig.from_json_file(bert_config_file)
        self.init_checkpoint = os.path.join(self.bert_root, 'bert_model.ckpt')
        self.bert_vocab_file = os.path.join(self.bert_root, 'vocab.txt')

        # tf.placeholder requires a dtype. The sequence inputs are
        # (batch, max_len) integer ids.
        # NOTE(review): assumes max_len equals the padded length produced by
        # the preprocessing step -- confirm against the caller.
        self.input_ids = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="input_ids")
        self.input_mask = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="input_mask")
        self.segment_ids = tf.placeholder(
            tf.int32, shape=[None, self.max_len], name="segment_ids")
        # Labels arrive as a (batch, 1) column of class ids.
        self.input_y = tf.placeholder(tf.int32, shape=[None, 1], name="input_y")
        self.global_step = tf.Variable(0, trainable=False)

        # Size the head from the loaded config instead of a fixed 768.
        hidden_size = self.bert_config.hidden_size
        output_weights = tf.get_variable(
            "output_weights", [self.NUM_LABELS, hidden_size],
            initializer=tf.random_normal_initializer(stddev=0.1))
        output_bias = tf.get_variable(
            "output_bias", [self.NUM_LABELS],
            initializer=tf.random_normal_initializer(stddev=0.1))
        self.w_out = output_weights
        self.b_out = output_bias

        model = modeling.BertModel(
            config=self.bert_config,
            is_training=False,
            input_ids=self.input_ids,
            input_mask=self.input_mask,
            token_type_ids=self.segment_ids,
            use_one_hot_embeddings=False)

        # Initialise every variable that also exists in the checkpoint.
        tvars = tf.trainable_variables()
        (assignment, initialized_variable_names) = \
            modeling.get_assignment_map_from_checkpoint(tvars, self.init_checkpoint)
        tf.train.init_from_checkpoint(self.init_checkpoint, assignment)

        output_layer_pooled = model.get_pooled_output()
        # NOTE(review): keep_prob is baked into the graph as a constant, so
        # this dropout is also active when evaluate() runs.
        self.output_layer_pooled = tf.nn.dropout(
            output_layer_pooled, keep_prob=self.keep_prob)

    def get_output(self):
        """Build logits, probabilities and the mean cross-entropy loss."""
        logits = tf.matmul(self.output_layer_pooled, self.w_out, transpose_b=True)
        self.logits = tf.nn.bias_add(logits, self.b_out)
        self.probabilities = tf.nn.softmax(self.logits, axis=-1, name='y')
        # log_softmax = logits - log(reduce_sum(exp(logits), axis))
        self.log_probs = tf.nn.log_softmax(self.logits, axis=-1)
        # Flatten the (batch, 1) labels first; one-hot of the unflattened
        # column would be (batch, 1, C) and broadcast against (batch, C).
        labels = tf.reshape(self.input_y, [-1])
        one_hot_labels = tf.one_hot(labels, depth=self.NUM_LABELS, dtype=tf.float32)
        per_example_loss = -tf.reduce_sum(one_hot_labels * self.log_probs, axis=-1)
        self.loss = tf.reduce_mean(per_example_loss)
        tf.summary.scalar('loss', self.loss)

    def get_accuracy(self):
        """Build the predicted/actual class tensors and the accuracy metric."""
        self.predicts = tf.argmax(self.logits, axis=-1)
        # input_y already holds class ids; argmax over its size-1 last axis
        # would always yield 0. Cast to int64 to match tf.argmax's output.
        self.actuals = tf.cast(tf.reshape(self.input_y, [-1]), tf.int64)
        self.accuracy = tf.reduce_mean(
            tf.cast(tf.equal(self.predicts, self.actuals), dtype=tf.float32))
        tf.summary.scalar('accuracy', self.accuracy)
        self.merged = tf.summary.merge_all()

    def get_trainOp(self):
        """Create and return the Adam training op."""
        # Passing global_step makes every training step increment it.
        self.train_op = tf.train.AdamOptimizer(self.lr).minimize(
            self.loss, global_step=self.global_step)
        return self.train_op

    def _feed(self, x_batch, y_batch):
        """Split a batch into the feed dict for the four placeholders."""
        return {self.input_ids: x_batch[:, 0],
                self.input_mask: x_batch[:, 1],
                self.segment_ids: x_batch[:, 2],
                self.input_y: y_batch}

    def evaluate(self, sess, devdata):
        """Return the mean batch accuracy over ``devdata`` as a percentage.

        Args:
            sess: Session in which the graph is run.
            devdata: Dataset handed to ``TextLoader`` for batching.
        """
        data_loader = TextLoader(devdata, self.batch_size)
        accuracies = []
        for i in range(data_loader.num_batches):
            x_train, y_train = data_loader.next_batch(i)
            accuracy = sess.run(self.accuracy, feed_dict=self._feed(x_train, y_train))
            accuracies.append(accuracy)
        acc = np.mean(accuracies) * 100
        return acc

    def run_step(self, sess, x_train, y_train):
        """Run one training step on a batch.

        Args:
            sess: Session in which the graph is run.
            x_train: Batch of features as produced by ``next_batch``.
            y_train: Batch of labels as produced by ``next_batch``.

        Returns:
            A tuple ``(step, loss, summary)``.
        """
        # train_op was missing from the fetch list, so four names were
        # unpacked from three results and no update was ever applied.
        step, loss_, _, log = sess.run(
            [self.global_step, self.loss, self.train_op, self.merged],
            feed_dict=self._feed(x_train, y_train))
        return step, loss_, log
第五步:建立主函数,建立run.py。
参考:
(1)多标签学习综述(A review on multi-label learning algorithms)
(2)深度学习模型处理多标签(multi_label)分类任务——keras实战
(3)NLP必读:十分钟读懂谷歌BERT模型