[Keras] word2vec_skipgram

This code first splits the corpus file alice_in_wonderland.txt into sentences and then converts each sentence into a sequence of word ids (corpus download link).

From each sentence we extract every run of 3 consecutive words as a tuple (left, center, right). The skip-gram model (here with a window of 3 words) is trained to predict left from center and to predict right from center.

So each tuple (left, center, right) yields two training pairs, i.e. [x, y] = [[x1, y1], [x2, y2]] = [[center, left], [center, right]].
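
For illustration, here is a minimal sketch of this pair construction on a toy sequence of word ids (the ids are made up; nltk.trigrams is the same helper the full code uses below):

import nltk

sentence = [12, 7, 4, 9, 31]            # hypothetical word ids for one sentence
pairs = []
for left, center, right in nltk.trigrams(sentence):
    pairs.append((center, left))         # predict the left neighbor from the center word
    pairs.append((center, right))        # predict the right neighbor from the center word
print(pairs)
# [(7, 12), (7, 4), (4, 7), (4, 9), (9, 4), (9, 31)]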

from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator

np.random.seed(2018)

BATCH_SIZE = 128
NUM_EPOCHS = 15

lines = []
fin = open("./data/alice_in_wonderland.txt", "r")
for line in fin:
    line = line.strip()
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()

sents = nltk.sent_tokenize(" ".join(lines)) # split the corpus into sentences

tokenizer = Tokenizer(num_words=5000)  # use top 5000 words only
tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1
sequences = tokenizer.texts_to_sequences(sents)
'''
    From each sentence, extract every 3 consecutive words as a tuple (left, center, right).
    The skip-gram model (window of 3 words) is trained to predict left from center and
    right from center, so each tuple yields two training pairs:
    [x, y] = [[x1, y1], [x2, y2]] = [[center, left], [center, right]]
'''
xs = []
ys = []
for sequence in sequences:
    triples = list(nltk.trigrams(sequence)) # every 3 consecutive word ids in the sentence form one tuple
    w_lefts = [x[0] for x in triples]  
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)
# convert the xs and ys collected above into one-hot matrices
'''
    For example, with a dictionary of size 5, one-hot encoding the two ids [[2], [4]]
    returns the matrix
    [
      [0,0,1,0,0],
      [0,0,0,0,1]
    ]
'''
ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense() 
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()

# hold out 30% of the data as the test set, 70% for training
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)

model = Sequential()
model.add(Dense(300, input_shape=(Xtrain.shape[1],)))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(Ytrain.shape[1]))
model.add(Activation("softmax"))

model.compile(optimizer="Nadam", loss="categorical_crossentropy", 
              metrics=["accuracy"])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS, verbose=1,
                    validation_data=(Xtest, Ytest))
'''
(34487, 2642) (14781, 2642) (34487, 2642) (14781, 2642)
Train on 34487 samples, validate on 14781 samples
Epoch 1/15
34487/34487 [==============================] - 21s 606us/step - loss: 6.1716 - acc: 0.0588 - val_loss: 5.8799 - val_acc: 0.0665
Epoch 2/15
34487/34487 [==============================] - 21s 601us/step - loss: 5.6982 - acc: 0.0897 - val_loss: 5.7263 - val_acc: 0.1024
Epoch 3/15
34487/34487 [==============================] - 20s 584us/step - loss: 5.4394 - acc: 0.1207 - val_loss: 5.6298 - val_acc: 0.1187
Epoch 4/15
34487/34487 [==============================] - 20s 588us/step - loss: 5.2074 - acc: 0.1415 - val_loss: 5.5756 - val_acc: 0.1223
Epoch 5/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.9883 - acc: 0.1526 - val_loss: 5.5623 - val_acc: 0.1207
Epoch 6/15
34487/34487 [==============================] - 20s 591us/step - loss: 4.7904 - acc: 0.1606 - val_loss: 5.5681 - val_acc: 0.1238
Epoch 7/15
34487/34487 [==============================] - 20s 588us/step - loss: 4.6211 - acc: 0.1673 - val_loss: 5.5959 - val_acc: 0.1231
Epoch 8/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.4720 - acc: 0.1675 - val_loss: 5.6278 - val_acc: 0.1217
Epoch 9/15
34487/34487 [==============================] - 20s 591us/step - loss: 4.3344 - acc: 0.1686 - val_loss: 5.6912 - val_acc: 0.1202
Epoch 10/15
34487/34487 [==============================] - 20s 587us/step - loss: 4.2347 - acc: 0.1664 - val_loss: 5.7420 - val_acc: 0.1187
Epoch 11/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.1467 - acc: 0.1676 - val_loss: 5.8011 - val_acc: 0.1223
Epoch 12/15
34487/34487 [==============================] - 21s 601us/step - loss: 4.0760 - acc: 0.1665 - val_loss: 5.8599 - val_acc: 0.1216
Epoch 13/15
34487/34487 [==============================] - 21s 597us/step - loss: 4.0243 - acc: 0.1656 - val_loss: 5.9128 - val_acc: 0.1178
Epoch 14/15
34487/34487 [==============================] - 21s 607us/step - loss: 3.9790 - acc: 0.1655 - val_loss: 5.9498 - val_acc: 0.1214
Epoch 15/15
34487/34487 [==============================] - 21s 617us/step - loss: 3.9430 - acc: 0.1673 - val_loss: 6.0056 - val_acc: 0.1194
'''
# plot loss function
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")

plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")

plt.tight_layout()
plt.show()

# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))

(Figure: training/validation accuracy and loss curves produced by the plotting code above.)

''' 
14781/14781 [==============================] - 3s 177us/step
Test score: 6.006, accuracy: 0.119
'''
# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}

# retrieve the weights from the first dense layer. These convert a one-hot
# input vector into its dense 300-dimensional representation
W, b = model.layers[0].get_weights()

# compute the embedding vector of every word in the vocabulary
idx2emb = {}    
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.transform(np.array([[wid]])).todense()  # one-hot row vector for this word id
    vec_emb = np.dot(vec_in, W)
    idx2emb[wid] = vec_emb

# find the 10 words whose embeddings are closest to each query word by cosine distance and print them
# (a vectorized version of this lookup is sketched after the output below)
for word in ["stupid", "wonderful", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(
            (
             (wid, i),
             cosine_distances(source_emb, target_emb)
            )
        )
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))
'''
stupid => holding, derision, northumbria, music, spot, waist, fighting, swallowing, pardoned, axes
wonderful => red, mile, lark, rat, tunnel, grammar, piteous, commotion, saves, poured
succeeded => grazed, sends, stool, upstairs, search, growled, harm, rustling, heels, spite
'''
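
Since a one-hot input vector times W simply selects one row of W, row wid of W already is the 300-dimensional embedding of word id wid, so the nearest-neighbor lookup above can also be done in vectorized form. A minimal sketch (the helper name nearest_words is my own, not part of the original code):

embeddings = np.asarray(W)                     # shape (vocab_size, 300); row i = embedding of word id i

def nearest_words(word, topn=10):
    wid = word2idx[word]
    # cosine distance between the query row and every row of the embedding matrix
    dists = cosine_distances(embeddings[wid].reshape(1, -1), embeddings)[0]
    dists[wid] = np.inf                        # exclude the query word itself
    dists[0] = np.inf                          # id 0 is never assigned by the Tokenizer
    return [idx2word[i] for i in np.argsort(dists)[:topn]]

print(nearest_words("stupid"))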