I. An Analysis of the Code Implementation
Section 1 already covered the principles behind the implementation; here we walk through how a CNN processes short text, using a concrete Twitter sentiment classification experiment. The raw input is a set of polarity-labeled tweets. Assuming a binary classification problem, tweets with the same polarity are placed in the same file, which yields two files, one positive and one negative, each containing one tweet per line. The tweets are first preprocessed with TwitterNLP, simply by calling the TwitterNLP API, and the results are saved in a fixed format that makes the later processing easier. The code below is only a fragment for illustration; the complete code is available for download.
# tokenize() is provided by the TwitterNLP toolkit.
f = open('dev_tweets_pos.txt')
pos_line = f.readlines()
f.close()
f = open('dev_tweets_neg.txt')
neg_line = f.readlines()
f.close()

f_pos = open('rt-polarity_dev_pos.txt', 'wb+')
f_neg = open('rt-polarity_dev_neg.txt', 'wb+')
# Tokenize each tweet and write it back out as lowercased,
# space-separated tokens, one tweet per line.
for item in pos_line:
    s = item.split('\n')
    word_list = tokenize(s[0])
    f_pos.write(' '.join(word.lower() for word in word_list) + '\n')
for item in neg_line:
    s = item.split('\n')
    word_list = tokenize(s[0])
    f_neg.write(' '.join(word.lower() for word in word_list) + '\n')
f_pos.close()
f_neg.close()
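After this step each output file contains one preprocessed tweet per line: tokenized, lowercased, with tokens separated by single spaces. For example (a hypothetical tweet, assuming the TwitterNLP tokenizer separates punctuation from words), "I LOVE this movie!!" would be written out as "i love this movie !!".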
Once the TwitterNLP preprocessing is done, Yoon Kim's process_data.py is used for the next stage of processing.
1. This function reads the TwitterNLP-processed text and counts the words in it, storing the counts in a dictionary (key: word, value: word count). It also assigns every tweet a random fold index, so that 10% of the data can serve as dev data in each round; ten-fold cross-validation is used later when training the CNN model.
import numpy as np
from collections import defaultdict

def build_data_cv(data_folder, cv=10, clean_string=True):
    """
    Loads data and split into 10 folds.
    """
    revs = []
    pos_file = data_folder[0]
    neg_file = data_folder[1]
    vocab = defaultdict(float)
    with open(pos_file, "rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                # clean_str is a helper defined elsewhere in process_data.py
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())   # count each word at most once per tweet
            for word in words:
                vocab[word] += 1
            datum = {"y": 1,                # positive label
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}   # random fold index
            revs.append(datum)
    with open(neg_file, "rb") as f:
        for line in f:
            rev = []
            rev.append(line.strip())
            if clean_string:
                orig_rev = clean_str(" ".join(rev))
            else:
                orig_rev = " ".join(rev).lower()
            words = set(orig_rev.split())
            for word in words:
                vocab[word] += 1
            datum = {"y": 0,                # negative label
                     "text": orig_rev,
                     "num_words": len(orig_rev.split()),
                     "split": np.random.randint(0, cv)}
            revs.append(datum)
    return revs, vocab
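As a quick sanity check, build_data_cv can be driven directly by the files produced in the preprocessing step above (a minimal sketch; the print statements are only for inspection):

data_folder = ["rt-polarity_dev_pos.txt", "rt-polarity_dev_neg.txt"]
revs, vocab = build_data_cv(data_folder, cv=10, clean_string=True)
print "number of tweets: %d" % len(revs)
print "vocabulary size: %d" % len(vocab)
print "max tweet length: %d" % max(d["num_words"] for d in revs)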
2. Load the pre-trained word-vector table. word2vec vectors are used here, but any other word-vector table could be substituted.
def load_bin_vec(fname, vocab):
    """
    Loads 300x1 word vecs from Google (Mikolov) word2vec
    """
    word_vecs = {}
    with open(fname, "rb") as f:
        header = f.readline()
        vocab_size, layer1_size = map(int, header.split())
        binary_len = np.dtype('float32').itemsize * layer1_size
        for line in xrange(vocab_size):
            word = []
            while True:
                ch = f.read(1)
                if ch == ' ':          # a space terminates the word
                    word = ''.join(word)
                    break
                if ch != '\n':
                    word.append(ch)
            if word in vocab:
                # keep vectors only for words that occur in our corpus
                word_vecs[word] = np.fromstring(f.read(binary_len), dtype='float32')
            else:
                f.read(binary_len)     # skip the vector to stay aligned in the file
    return word_vecs
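Calling it then looks roughly like this (the GoogleNews file name is an assumption; substitute whichever binary word2vec file you use):

w2v_file = 'GoogleNews-vectors-negative300.bin'   # assumed path to the binary word2vec file
w2v = load_bin_vec(w2v_file, vocab)
print "words found in word2vec: %d / %d" % (len(w2v), len(vocab))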
3. Words that do not appear in word2vec are initialized by sampling from a uniform random distribution.
def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents, create a separate word vector.
    0.25 is chosen so the unknown vectors have (approximately) same variance as pre-trained ones
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
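The 0.25 in the docstring can be verified: a uniform distribution on [-a, a] has variance a^2/3, so uniform(-0.25, 0.25) has variance 0.25^2/3 ≈ 0.021, which approximately matches the variance of the pre-trained vectors. A quick numerical check (illustrative only):

sample = np.random.uniform(-0.25, 0.25, (10000, 300))
print sample.var()   # ~0.0208, i.e. 0.25**2 / 3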
4. Build the word-vector matrix.
def get_W(word_vecs, k=300):
    """
    Get word matrix. W[i] is the vector for word indexed by i
    """
    vocab_size = len(word_vecs)
    word_idx_map = dict()
    W = np.zeros(shape=(vocab_size + 1, k), dtype='float32')
    W[0] = np.zeros(k, dtype='float32')   # row 0 is kept as an all-zero vector
    i = 1
    for word in word_vecs:
        W[i] = word_vecs[word]
        word_idx_map[word] = i
        i += 1
    return W, word_idx_map
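Putting the pieces together, the preprocessing pipeline ends with the embedding matrix and the word-to-index map. The sketch below shows how a processed tweet would be turned into a sequence of row indices for the CNN; tweet_to_indices is a hypothetical helper, not part of process_data.py, and index 0 (the all-zero row) is left free for padding:

add_unknown_words(w2v, vocab)          # fill in vectors for words missing from word2vec
W, word_idx_map = get_W(w2v, k=300)    # embedding matrix + word-to-index map

def tweet_to_indices(text, word_idx_map):
    # hypothetical helper: map a preprocessed tweet to indices into W
    return [word_idx_map[w] for w in text.split() if w in word_idx_map]

print tweet_to_indices(revs[0]["text"], word_idx_map)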