This post walks through a TensorFlow implementation of the paper "A Convolutional Neural Network for Modelling Sentences"; both the code and the dataset can be downloaded from my GitHub.
Dataset and Preprocessing
This post reproduces the paper's second experiment, which uses the TREC dataset. TREC is a question-answering dataset for classifying question types: each question belongs to one of six coarse categories (for example location, human, or numeric information), and we encode the category as a one-hot vector. The dataset contains 5452 labelled training samples and 500 test samples. Each sample looks like the line below: the label comes first (coarse and fine class separated by a colon), followed by the question itself:
NUM:date When did Hawaii become a state ?
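For illustration, here is a minimal sketch of how the six coarse TREC categories (ABBR, DESC, ENTY, HUM, LOC, NUM) map to one-hot vectors; note that the code below builds this mapping in whatever order the labels are first encountered, not in this fixed order:

import numpy as np

# The six coarse TREC question categories, lower-cased as clean_str() would leave them.
labels = ["abbr", "desc", "enty", "hum", "loc", "num"]
label_to_idx = {label: i for i, label in enumerate(labels)}
one_hot = np.identity(len(labels))

print(one_hot[label_to_idx["num"]])  # [0. 0. 0. 0. 0. 1.]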
Next, let's look at the data-processing functions, which live in dataUtils.py. They are much the same as in earlier posts: read the sentences and labels from the files, pad the sentences to a common length, build the vocabulary, and convert each sentence into a sequence of word indices so the embedding layer can map them to word vectors. The code is below and is commented, so I won't go through it in detail; to use it, just call load_data().
import re
import itertools
from collections import Counter

import numpy as np


def clean_str(string):
    """
    Tokenisation/string cleaning: strip unwanted characters, split
    punctuation and contractions into separate tokens, lower-case.
    """
    string = re.sub(r"[^A-Za-z0-9:(),!?\'\`]", " ", string)
    string = re.sub(r" : ", ":", string)  # keep "coarse:fine" labels as one token
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    # The replacement strings must not contain backslashes: with " \( " etc.
    # a literal "\(" token ends up in the cleaned text.
    string = re.sub(r"\(", " ( ", string)
    string = re.sub(r"\)", " ) ", string)
    string = re.sub(r"\?", " ? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
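# Example: applied to the sample line shown earlier, clean_str pads the
# punctuation with spaces and lower-cases the text:
#   clean_str("NUM:date When did Hawaii become a state ?")
#   -> 'num:date when did hawaii become a state ?'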
def load_data_and_labels():
    """
    Loads data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    folder_prefix = 'data/'
    with open(folder_prefix + "train") as f:
        x_train = f.readlines()
    with open(folder_prefix + "test") as f:
        x_test = f.readlines()
    test_size = len(x_test)
    x_text = x_train + x_test
    x_text = [clean_str(sent) for sent in x_text]
    # The first token of each line is the label; the part before the colon
    # is the coarse category (e.g. "num" in "num:date").
    y = [s.split(' ')[0].split(':')[0] for s in x_text]
    # Keep only the question words.
    x_text = [s.split(" ")[1:] for s in x_text]
    # Generate one-hot labels: assign each distinct category an index,
    # then pick the matching row of an identity matrix.
    all_label = dict()
    for label in y:
        if label not in all_label:
            all_label[label] = len(all_label) + 1
    one_hot = np.identity(len(all_label))
    y = [one_hot[all_label[label] - 1] for label in y]
    return [x_text, y, test_size]
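# Example of the returned structure (the counts follow from the dataset
# description above: 5452 training + 500 test lines):
#   x_text, y, test_size = load_data_and_labels()
#   len(x_text) == 5952   # token lists, e.g. ['when', 'did', 'hawaii', ...]
#   y[0]                  # a one-hot vector such as array([1., 0., 0., 0., 0., 0.])
#   test_size == 500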
def pad_sentences(sentences, padding_word="<PAD/>"):
    """
    Pads all sentences to the same length. The length is defined by the longest sentence.
    Returns padded sentences.
    """
    sequence_length = max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        num_padding = sequence_length - len(sentence)
        new_sentence = sentence + [padding_word] * num_padding
        padded_sentences.append(new_sentence)
    return padded_sentences
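# Example: shorter sentences are filled up to the length of the longest one:
#   pad_sentences([['when', 'did', 'hawaii'], ['who']])
#   -> [['when', 'did', 'hawaii'], ['who', '<PAD/>', '<PAD/>']]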
def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns the vocabulary and the inverse mapping (index -> word).
    """
    word_counts = Counter(itertools.chain(*sentences))
    # Most frequent words get the smallest indices.
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    return [vocabulary, vocabulary_inv]
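The load_data() entry point mentioned above is not shown in this excerpt. Here is a minimal sketch of how it could chain these helpers together, assuming a return signature of [x, y, vocabulary, vocabulary_inv, test_size]; the actual dataUtils.py may differ in details:

def load_data():
    # Assumed wrapper, not shown in the excerpt above: clean and label the
    # raw lines, pad to a common length, then map every word to its index.
    sentences, labels, test_size = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x = np.array([[vocabulary[word] for word in sentence]
                  for sentence in sentences_padded])
    y = np.array(labels)
    return [x, y, vocabulary, vocabulary_inv, test_size]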