1. TextVectorization + Embedding + LSTM
The TextVectorization layer is used to convert the raw text into integer sequences inside the model itself.
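A minimal sketch of what the layer does on a toy corpus (hypothetical strings; the exact ids depend on word frequencies): adapt() builds the vocabulary from raw text, and calling the layer maps strings to fixed-length integer sequences.

import tensorflow as tf
from keras.layers import TextVectorization

tv = TextVectorization(max_tokens=10, output_mode='int', output_sequence_length=5)
tv.adapt(["the cat sat", "the dog ran"])  # build the vocabulary from raw text
print(tv.get_vocabulary())                # ['', '[UNK]', 'the', ...] -- index 0 pads, index 1 is OOV
print(tv(tf.constant(["the cat ran"])))   # e.g. [[2 4 3 0 0]], padded to length 5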
import tensorflow as tf
import keras
from keras.layers import Conv1D, Dense, Embedding, GlobalAveragePooling1D, Activation, BatchNormalization, Layer, \
    Dropout, LSTM, TextVectorization, Input


class NCModel(keras.Model):
    def __init__(self, vocab_size, max_length, vocab, classes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # preprocessing layers: TextVectorization maps raw strings to integer ids,
        # Embedding maps those ids to dense vectors
        i = Input(shape=(1,), dtype=tf.string)
        tv = TextVectorization(max_tokens=vocab_size, output_mode='int', output_sequence_length=max_length)
        tv.adapt(vocab)  # build the vocabulary from the training texts
        self.pp = keras.Sequential(
            layers=[tv, Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length)]
        )
        self.lstm = LSTM(32)
        self.fc = Dense(classes, activation="softmax")
        self.call(i)  # trace once so that summary() sees concrete shapes

    def call(self, inputs, training=None, mask=None):
        outputs = self.pp(inputs)
        outputs = self.lstm(outputs)
        outputs = self.fc(outputs)
        return outputs
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from NCModel import NCModel
import pandas as pd
using_gpu_index = 0
gpu_list = tf.config.experimental.list_physical_devices('GPU')
if len(gpu_list) > 0:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpu_list[using_gpu_index],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]  # limit the size of GPU memory
        )
    except RuntimeError as e:
        print(e)
else:
    print("Got no GPUs")
# parameters
vocab_size = 7000
max_length = 200
# trunc_type, padding_type and oov_tok are only needed by the Tokenizer pipelines in the
# later sections; TextVectorization does its own padding, truncation and OOV handling
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
# split data
train_df = pd.read_csv("./dataset/preprocessed_train.csv")
x, y = train_df['content'].to_numpy(), train_df['category'].to_numpy().reshape(len(train_df['category']), 1)
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, stratify=y, shuffle=True)
# model construction
model = NCModel(vocab_size=vocab_size, max_length=max_length, vocab=train_x, classes=32)
model.build((None, 1))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer=Adam(),
              metrics=['accuracy'])
model.summary()
# train and evaluate
history = model.fit(train_x, train_y, batch_size=32, epochs=10)
model.evaluate(valid_x, valid_y, return_dict=True)
# accuracy: 0.6517
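Because vectorization happens inside the model, the trained model accepts raw strings directly at inference time. A minimal sketch (the headline below is made up):

import numpy as np

sample = np.array([["some news headline to classify"]])  # hypothetical raw input, shape (1, 1)
probs = model.predict(sample)                            # (1, 32) softmax scores
print(np.argmax(probs, axis=-1))                         # predicted category id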
2. Embedding + LSTM
The data is converted with Keras's Tokenizer instead of a TextVectorization layer.
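A minimal sketch of the Tokenizer workflow on a toy corpus (hypothetical strings; exact ids depend on word frequencies): fit_on_texts builds the vocabulary, texts_to_sequences maps words to ids, and pad_sequences fixes the length.

import tensorflow as tf
from keras.preprocessing.text import Tokenizer

tok = Tokenizer(num_words=10, oov_token="<OOV>")
tok.fit_on_texts(["the cat sat", "the dog ran"])
seqs = tok.texts_to_sequences(["the cat ran", "a bird flew"])  # unseen words map to the <OOV> id
print(tf.keras.preprocessing.sequence.pad_sequences(seqs, maxlen=5, padding='post', truncating='post'))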
class NCModel2(keras.Model):
    def __init__(self, vocab_size, max_length, classes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # inputs are integer sequences already produced by the Tokenizer,
        # so the dummy input is int-typed (not tf.string as in NCModel)
        i = Input(shape=(max_length,), dtype=tf.int32)
        self.embedding = Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length)
        self.lstm = LSTM(32)
        self.fc = Dense(classes, activation="softmax")
        self.call(i)  # trace once so that summary() sees concrete shapes

    def call(self, inputs, training=None, mask=None):
        outputs = self.embedding(inputs)
        outputs = self.lstm(outputs)
        outputs = self.fc(outputs)
        return outputs
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from NCModel import NCModel2
import pandas as pd
# set GPU memory, can be ignored
using_gpu_index = 0
gpu_list = tf.config.experimental.list_physical_devices('GPU')
if len(gpu_list) > 0:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpu_list[using_gpu_index],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]  # limit the size of GPU memory
        )
    except RuntimeError as e:
        print(e)
else:
    print("Got no GPUs")
# parameters
vocab_size = 7000
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
# train-test split
train_df = pd.read_csv("./dataset/preprocessed_train.csv")
x, y = train_df['content'].to_numpy(), train_df['category'].to_numpy().reshape(len(train_df['category']), 1)
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, stratify=y, shuffle=True)
# tokenize sentences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
# convert train dataset to sequence and pad sequences
train_x = tokenizer.texts_to_sequences(train_x)
train_x = tf.keras.preprocessing.sequence.pad_sequences(train_x, padding=padding_type, truncating=trunc_type,
                                                        maxlen=max_length)
# convert valid dataset to sequence and pad sequences
valid_x = tokenizer.texts_to_sequences(valid_x)
valid_x = tf.keras.preprocessing.sequence.pad_sequences(valid_x, padding=padding_type, truncating=trunc_type,
                                                        maxlen=max_length)
# model construction
model = NCModel2(vocab_size=vocab_size, max_length=max_length, classes=32)
model.build((None, 200))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer=Adam(),
              metrics=['accuracy'])
model.summary()
# train and evaluate
history = model.fit(train_x, train_y, batch_size=32, epochs=10)
model.evaluate(valid_x, valid_y, return_dict=True)
# accuracy: 0.6445
3. Embedding + Conv1D
The data is again converted with Keras's Tokenizer.
class NCModel3(keras.Model):
    def __init__(self, vocab_size, max_length, classes, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # inputs are integer sequences produced by the Tokenizer
        i = Input(shape=(max_length,), dtype=tf.int32)
        self.embedding = Embedding(input_dim=vocab_size, output_dim=64, input_length=max_length)
        self.cnn = CNNLayer(32)
        self.pool = GlobalAveragePooling1D()
        self.fc = Dense(classes, activation="softmax")
        self.call(i)  # trace once so that summary() sees concrete shapes

    def call(self, inputs, training=None, mask=None):
        outputs = self.embedding(inputs)
        outputs = self.cnn(outputs)
        outputs = self.pool(outputs)
        outputs = self.fc(outputs)
        return outputs


class CNNLayer(Layer):
    # Conv1D -> BatchNorm -> ReLU; padding="same" keeps the sequence length unchanged
    def __init__(self, filters, **kwargs):
        super().__init__(**kwargs)
        self.cl = Conv1D(filters=filters, kernel_size=2, strides=1, padding="same")
        self.bn = BatchNormalization()
        self.relu = Activation("relu")

    def call(self, inputs, *args, **kwargs):
        outputs = self.cl(inputs)
        outputs = self.bn(outputs)
        outputs = self.relu(outputs)
        return outputs
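A quick shape check on random data (a sketch, not part of the pipeline): with padding="same" the Conv1D keeps the time dimension, so the (batch, 200, 64) embeddings become (batch, 200, 32) feature maps, which GlobalAveragePooling1D averages down to (batch, 32) before the softmax classifier.

x = tf.random.normal((4, 200, 64))                        # as if produced by the Embedding layer
y = CNNLayer(32)(x)
print(y.shape)                                            # (4, 200, 32)
print(tf.keras.layers.GlobalAveragePooling1D()(y).shape)  # (4, 32)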
from sklearn.model_selection import train_test_split
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.optimizers import Adam
from keras.losses import SparseCategoricalCrossentropy
from NCModel import NCModel3
import pandas as pd
# set GPU memory, can be ignored
using_gpu_index = 0
gpu_list = tf.config.experimental.list_physical_devices('GPU')
if len(gpu_list) > 0:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpu_list[using_gpu_index],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]  # limit the size of GPU memory
        )
    except RuntimeError as e:
        print(e)
else:
    print("Got no GPUs")
# parameters
vocab_size = 7000
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = "<OOV>"
# train-test split
train_df = pd.read_csv("./dataset/preprocessed_train.csv")
x, y = train_df['content'].to_numpy(), train_df['category'].to_numpy().reshape(len(train_df['category']), 1)
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, stratify=y, shuffle=True)
# tokenize sentences
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(train_x)
# convert train dataset to sequence and pad sequences
train_x = tokenizer.texts_to_sequences(train_x)
train_x = tf.keras.preprocessing.sequence.pad_sequences(train_x, padding=padding_type, truncating=trunc_type,
                                                        maxlen=max_length)
# convert valid dataset to sequence and pad sequences
valid_x = tokenizer.texts_to_sequences(valid_x)
valid_x = tf.keras.preprocessing.sequence.pad_sequences(valid_x, padding=padding_type, truncating=trunc_type,
                                                        maxlen=max_length)
# model construction
model = NCModel3(vocab_size=vocab_size, max_length=max_length, classes=32)
model.build((None, 200))
model.compile(loss=SparseCategoricalCrossentropy(),
              optimizer=Adam(),
              metrics=['accuracy'])
model.summary()
# train and evaluate
history = model.fit(train_x, train_y, batch_size=32, epochs=10)
model.evaluate(valid_x, valid_y, return_dict=True)
# accuracy: 0.6463
4. Transformer
Fine-tuning a pretrained Transformer takes much longer to train, but performs noticeably better.
import pandas as pd
import numpy as np
import tensorflow as tf
import evaluate
from transformers import DataCollatorWithPadding, create_optimizer, TFAutoModelForSequenceClassification, AutoTokenizer
from transformers.keras_callbacks import KerasMetricCallback
from keras.callbacks import ModelCheckpoint
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
# set GPU memory, can be ignored
using_gpu_index = 0
gpu_list = tf.config.experimental.list_physical_devices('GPU')
if len(gpu_list) > 0:
    try:
        tf.config.experimental.set_virtual_device_configuration(
            gpu_list[using_gpu_index],
            [tf.config.experimental.VirtualDeviceConfiguration(memory_limit=10240)]  # limit the size of GPU memory
        )
    except RuntimeError as e:
        print(e)
else:
    print("Got no GPUs")
# read data
train_df = pd.read_csv("./dataset/preprocessed_train.csv")
x, y = train_df['content'].to_numpy(), train_df['category'].to_numpy()
# split data
train_x, valid_x, train_y, valid_y = train_test_split(x, y, test_size=0.2, stratify=y)
ds_train = Dataset.from_dict({"text": train_x, "label": train_y})
ds_valid = Dataset.from_dict({"text": valid_x, "label": valid_y})
ds = DatasetDict()
ds["train"] = ds_train
ds["valid"] = ds_valid
# tokenizer
def preprocess_function(examples):
    # truncate to the model's maximum length; padding is deferred to the data collator
    return tokenizer(examples["text"], truncation=True)
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
tokenized_ds = ds.map(preprocess_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")
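Since preprocess_function only truncates, DataCollatorWithPadding performs dynamic padding: each batch is padded to the length of its own longest example rather than to a global maximum. A small sketch of the effect (toy sentences; the exact shape depends on tokenization):

features = [tokenizer("short text"), tokenizer("a somewhat longer piece of text")]
batch = data_collator(features)
print(batch["input_ids"].shape)  # e.g. (2, 9): padded to the longest example in this batch, not to 512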
# conversion between id and label
category_df = pd.read_csv("dataset/category_dict.csv")
id2label = {}
label2id = {}
for i in range(len(category_df)):
    category_id = int(category_df["category_id"].loc[i])
    category_name = category_df["category_name"].loc[i]
    id2label[category_id] = category_name
    label2id[category_name] = category_id
# evaluation
accuracy = evaluate.load("accuracy")
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)
# model construction
batch_size = 16
num_epochs = 3
batches_per_epoch = len(ds_train) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)  # the schedule should cover exactly the steps fit() will run
optimizer, schedule = create_optimizer(init_lr=2e-5, num_warmup_steps=0, num_train_steps=total_train_steps)
model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=32, id2label=id2label, label2id=label2id)
tf_train_set = model.prepare_tf_dataset(
    tokenized_ds['train'],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)
tf_validation_set = model.prepare_tf_dataset(
    tokenized_ds['valid'],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)
model.compile(optimizer=optimizer)
# train
metric_callback = KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set)
model_checkpoint = ModelCheckpoint("NC", save_best_only=True)  # keep only the best epoch (by validation loss)
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=[metric_callback, model_checkpoint])
# accuracy 0.74
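A minimal inference sketch with the fine-tuned model still in memory (the input text is made up; id2label comes from the mapping built above):

inputs = tokenizer("some news text to classify", return_tensors="tf", truncation=True)
logits = model(**inputs).logits              # (1, 32) raw scores
pred_id = int(tf.argmax(logits, axis=-1)[0])
print(pred_id, id2label[pred_id])            # predicted category id and name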
All of the model examples above are only meant to illustrate the approach; good performance is not guaranteed.
GitHub (with the dataset): https://github.com/VAMPIREONETWO/News-Classification