'''
This script loads pre-trained word embeddings (word2vec embeddings)
into a Keras Embedding layer and uses them to train a text classification
model on a customized dataset.
'''
from __future__ import print_function
from collections import defaultdict
import os
import re
import numpy as np
import pandas as pd
np.random.seed(1337)  # fix the random seed for reproducibility
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.layers import Convolution1D, Dropout, Activation
from keras.models import Sequential, Model
w2v_file = 'G:/pre_trained word embeddings/word2vec/vectors.bin'
train_data = './cqa_title/traindata/userprofilepythontitle.txt'
test_data = './cqa_title/testdata/TestQuestionsPythonTitle.txt'
EMBEDDING_DIM = 400
MAX_SEQUENCE_LENGTH = 25
NB_FILTER = 128
FILTER_LENGTH = 5


def build_data_cv(clean_string=False):
    """
    Loads the train and test data and builds the word-frequency vocabulary.
    """
    revs = []
    vocab = defaultdict(float)
    with open(train_data, "r") as f:
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])  # the first token is the 1-indexed class label
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 0}  # split 0 marks a training example
            revs.append(datum)
    with open(test_data, "r") as f:
        for line in f:
            rev = [line.strip()]
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = orig_rev.split()
            n_y = int(words[0])
            for word in words[1:]:
                vocab[word] += 1
            datum = {"y": n_y - 1,
                     "text": " ".join(words[1:]),
                     "num_words": len(words) - 1,
                     "split": 1}  # split 1 marks a test example
            revs.append(datum)
    return revs, vocab
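# Expected input format, inferred from the parsing above (the example line is
# hypothetical, not from the dataset): "<label> <title text>", e.g.
#   "3 how to sort a list of dictionaries by a value of the dictionary"
# which yields y = 2 and text = "how to sort a list of dictionaries by a value
# of the dictionary".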
def clean_str(string, TREC=False):
    """
    Tokenization/string cleaning for all datasets except SST.
    Every dataset is lower-cased except TREC.
    """
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip() if TREC else string.strip().lower()
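# A quick sanity check of clean_str (the input string is a made-up example):
#   clean_str("Can't read files(quickly)!")
#   -> "ca n't read files ( quickly ) !"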
def load_bin_vec(fname, vocab):
    """
    Loads word vectors from a word2vec binary file (Mikolov's C format),
    keeping only the words that appear in vocab.
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for _ in range(vocab_size):
            # Read the word one byte at a time, up to the separating space.
            word = []
            while True:
                ch = f.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8')
                    break
                if ch != b'\n':
                    word.append(ch)
            if word in vocab:
                word_vecs[word] = np.frombuffer(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)
    return word_vecs
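# The word2vec C binary format parsed above: a text header
# "<vocab_size> <layer1_size>\n", then per word its characters, one space,
# layer1_size float32 values, and a trailing newline (skipped by the loop above).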
def add_unknown_words(word_vecs, vocab, min_df=1, k=EMBEDDING_DIM):
    """
    For words that occur at least min_df times, create a random word vector.
    0.25 is chosen so the unknown vectors have (approximately) the same
    variance as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
    return word_vecs
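# Note: a uniform(-0.25, 0.25) draw has variance 0.25**2 / 3 ~= 0.02 per
# dimension, which the docstring above takes as roughly the scale of the
# pre-trained vectors.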
def get_W(word_vecs, k=EMBEDDING_DIM):
    """
    Get the word matrix. W[i] is the vector for the word indexed by i;
    row 0 is kept as all zeros for the padding index.
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
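# ---------------------------------------------------------------------------
# Reconstructed glue code: the layers below reference `vocab`, `W` and
# `labels`, which this fragment never defines. A minimal sketch, assuming
# min_df=1 (so every vocabulary word gets a vector) and that the .bin file's
# dimensionality matches EMBEDDING_DIM; the intermediate names (w2v,
# sequences, data) are our own, not from the original script.
revs, vocab = build_data_cv(clean_string=True)
w2v = load_bin_vec(w2v_file, vocab)
w2v = add_unknown_words(w2v, vocab)
W, word_idx_map = get_W(w2v)

# Turn each text into a fixed-length sequence of word indices (0 is padding)
# and each label into a one-hot vector.
sequences = [[word_idx_map[w] for w in rev["text"].split()] for rev in revs]
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
labels = to_categorical(np.array([rev["y"] for rev in revs]))
# ---------------------------------------------------------------------------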
# Load the pre-trained word embeddings into an Embedding layer.
# Note that we set trainable=True so as to fine-tune the embeddings.
embedding_layer = Embedding(len(vocab) + 1,
                            EMBEDDING_DIM,
                            weights=[W],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
print ("Training model.")
model = Sequential()
model.add(embedding_layer)
model.add(Convolution1D(nb_filter=NB_FILTER,
filter_length=FILTER_LENGTH,
border_mode='valid',
activation='relu',
subsample_length=1))
# Use max pooling over the entire sequence:
model.add(MaxPooling1D(pool_length=model.output_shape[1]))
model.add(Dropout(0.5))
model.add(Activation('relu'))
# We flatten the output of the conv layer,
# so that we can add a vanilla dense layer:
model.add(Flatten())
# We add a vanilla hidden layer:
model.add(Dense(128, activation='relu'))
# We project onto the output layer, one unit per class, and squash it with a softmax:
model.add(Dense(len(labels[0]), activation='softmax'))
model.summary()
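# A minimal sketch of the training step this section stops short of, using the
# train/test split recorded in `revs`; the optimizer, batch size and epoch
# count are assumptions, not taken from the original script.
train_mask = np.array([rev["split"] == 0 for rev in revs])
x_train, y_train = data[train_mask], labels[train_mask]
x_test, y_test = data[~train_mask], labels[~train_mask]

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=50,
          nb_epoch=10,
          validation_data=(x_test, y_test))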