本文介绍论文“A Convolutional Neural Network for Modelling Sentences”基于 TensorFlow 的实现方法,代码和数据集都可以从我的 GitHub 上下载。
NUM:date When did Hawaii become a state ?
# Ordered (pattern, replacement) rewrite rules used by clean_str.
# The order is significant: each rule runs on the output of the previous one.
# Replacements are raw strings so that the backslashes in " \( ", " \) " and
# " \? " are spelled explicitly instead of relying on invalid escape
# sequences in ordinary string literals (a SyntaxWarning on Python 3.12+).
# re.sub leaves these non-letter escapes alone, so the backslash is emitted
# into the cleaned text, exactly as before.
_CLEAN_STR_RULES = (
    (r"[^A-Za-z0-9:(),!?\'\`]", " "),  # blank out every other character
    (r" : ", ":"),                      # glue a spaced colon back together
    (r"\'s", " 's"),                    # split contractions off their word
    (r"\'ve", " 've"),
    (r"n\'t", " n't"),
    (r"\'re", " 're"),
    (r"\'d", " 'd"),
    (r"\'ll", " 'll"),
    (r",", " , "),                      # isolate punctuation as tokens
    (r"!", " ! "),
    (r"\(", r" \( "),
    (r"\)", r" \) "),
    (r"\?", r" \? "),
    (r"\s{2,}", " "),                   # collapse runs of whitespace
)


def clean_str(string):
    """Normalise a raw line of text into space-separated lowercase tokens.

    Characters outside letters, digits and ``: ( ) , ! ? ' ``` are replaced
    by spaces, contractions are split from the preceding word, and
    punctuation is surrounded by spaces. Parentheses and question marks are
    emitted with a leading backslash. Finally, whitespace runs are collapsed
    and the result is stripped and lowercased.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lowercased string.
    """
    for pattern, replacement in _CLEAN_STR_RULES:
        string = re.sub(pattern, replacement, string)
    return string.strip().lower()
def load_data_and_labels():
    """Load the train and test files, tokenise them and one-hot encode labels.

    Reads ``data/train`` and ``data/test`` (relative to the working
    directory), cleans every line with ``clean_str`` and splits it on single
    spaces. The first token supplies the label: only the part before the
    first ``:`` is kept. The remaining tokens form the sentence.

    Files are opened with the platform default encoding.

    Returns:
        A three-element list ``[x_text, y, test_size]`` where ``x_text`` is
        the list of token lists (training lines first, then test lines),
        ``y`` is the list of one-hot label rows in the same order, and
        ``test_size`` is the number of lines read from the test file.
    """
    # Load data from files; the context managers close the handles promptly.
    folder_prefix = 'data/'
    with open(folder_prefix + "train") as train_file:
        x_train = train_file.readlines()
    with open(folder_prefix + "test") as test_file:
        x_test = test_file.readlines()
    test_size = len(x_test)

    x_text = [clean_str(sent) for sent in x_train + x_test]
    y = [s.split(' ')[0].split(':')[0] for s in x_text]
    x_text = [s.split(" ")[1:] for s in x_text]

    # Generate labels: each distinct label gets the next free index, in
    # order of first appearance, which selects its row of the identity matrix.
    all_label = dict()
    for label in y:
        if label not in all_label:
            all_label[label] = len(all_label)
    one_hot = np.identity(len(all_label))
    y = [one_hot[all_label[label]] for label in y]
    return [x_text, y, test_size]
def pad_sentences(sentences, padding_word="<PAD/>"):
    """Pad all sentences to the length of the longest one.

    Args:
        sentences: A sequence of token sequences.
        padding_word: The token appended to shorter sentences.

    Returns:
        A new list of token lists, all of equal length. The input sentences
        are not modified. An empty input yields an empty list.
    """
    # default=0 keeps an empty input from raising ValueError in max().
    sequence_length = max((len(x) for x in sentences), default=0)
    padded_sentences = []
    for sentence in sentences:
        num_padding = sequence_length - len(sentence)
        # Collect each padded sentence so the result is not always empty.
        padded_sentences.append(list(sentence) + [padding_word] * num_padding)
    return padded_sentences
def build_vocab