Part 1: 词向量运算
欢迎来到本周第一个作业。
由于词嵌入的训练计算量庞大且耗费时间长,绝大部分机器学习人员都会导入一个预训练的词嵌入模型。
你将学到:
- 加载预训练单词向量,使用余弦测量相似度
- 使用词嵌入解决类比问题,比如 “Man is to Woman as King is to __”
- 修改词嵌入以减少它们的性别偏见
导包
import numpy as np
from w2v_utils import *
w2v_utils 中有用的函数
from keras.models import Model
from keras.layers import Input, Dense, Reshape, merge
from keras.layers.embeddings import Embedding
from keras.preprocessing.sequence import skipgrams
from keras.preprocessing import sequence
import urllib.request
import collections
import os
import zipfile
import numpy as np
import tensorflow as tf
# Module-level settings. Only valid_size and valid_examples are read by the
# code visible in this file (SimilarityCallback.run_sim); the others are
# presumably skip-gram training hyper-parameters used elsewhere -- TODO confirm.
window_size = 3
vector_dim = 300
epochs = 1000
valid_size = 16 # Random set of words to evaluate similarity on.
valid_window = 100 # Only pick dev samples in the head of the distribution.
# Draw valid_size distinct integers from range(valid_window); they are later
# used as keys into reverse_dictionary. Unseeded, so the sample differs per run.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)
def maybe_download(filename, url, expected_bytes):
    """Fetch ``filename`` from ``url`` unless it already exists, then size-check it.

    Arguments:
    filename -- local path of the file; also appended to ``url`` to form the download address
    url -- base address the file is retrieved from
    expected_bytes -- size in bytes the file on disk must have

    Returns:
    filename -- path of the verified file

    Raises:
    Exception -- when the size on disk differs from ``expected_bytes``
    """
    if not os.path.exists(filename):
        filename, _headers = urllib.request.urlretrieve(url + filename, filename)

    actual_size = os.stat(filename).st_size
    if actual_size != expected_bytes:
        # Show the size that was actually found before failing.
        print(actual_size)
        raise Exception(
            'Failed to verify ' + filename + '. Can you get to it with a browser?')

    print('Found and verified', filename)
    return filename
# Read the data into a list of strings.
def read_data(filename):
    """Return the whitespace-separated tokens of the first member of a zip archive.

    Arguments:
    filename -- path of the zip archive

    Returns:
    data -- list of strings obtained by splitting the decoded member on whitespace
    """
    with zipfile.ZipFile(filename) as archive:
        first_member = archive.namelist()[0]
        raw_bytes = archive.read(first_member)
    return tf.compat.as_str(raw_bytes).split()
def build_dataset(words, n_words):
    """Encode a token list as integer ids over a frequency-capped vocabulary.

    Arguments:
    words -- list of tokens (iterated twice, so it must be re-iterable)
    n_words -- vocabulary size, counting the 'UNK' entry

    Returns:
    data -- list with the id of every token in ``words``; tokens outside the vocabulary get id 0
    count -- [['UNK', number_of_unknown_tokens]] followed by the (word, frequency)
             pairs of the ``n_words - 1`` most common tokens
    dictionary -- dict mapping word -> id
    reversed_dictionary -- dict mapping id -> word
    """
    count = [['UNK', -1]]
    count.extend(collections.Counter(words).most_common(n_words - 1))

    # Ids are handed out in order of appearance in `count`, so 'UNK' gets 0.
    dictionary = {}
    for entry in count:
        dictionary[entry[0]] = len(dictionary)

    data = []
    unk_count = 0
    for token in words:
        token_id = dictionary.get(token)
        if token_id is None:
            token_id = 0  # dictionary['UNK']
            unk_count += 1
        data.append(token_id)

    # Replace the -1 placeholder with the real number of unknown tokens.
    count[0][1] = unk_count
    reversed_dictionary = {token_id: word for word, token_id in dictionary.items()}
    return data, count, dictionary, reversed_dictionary
def collect_data(vocabulary_size=10000):
    """Download (if needed) the text8 corpus and turn it into an id-encoded dataset.

    Arguments:
    vocabulary_size -- number of vocabulary entries passed on to build_dataset

    Returns:
    data, count, dictionary, reverse_dictionary -- the four values produced by build_dataset
    """
    archive_path = maybe_download('text8.zip', 'http://mattmahoney.net/dc/', 31344016)
    vocabulary = read_data(archive_path)
    # Quick sanity check of what was read.
    print(vocabulary[:7])
    dataset = build_dataset(vocabulary, vocabulary_size)
    del vocabulary  # Hint to reduce memory.
    data, count, dictionary, reverse_dictionary = dataset
    return data, count, dictionary, reverse_dictionary
class SimilarityCallback:
    """Print the nearest neighbours of the validation words.

    Relies on module-level names: valid_size, valid_examples, reverse_dictionary,
    vocab_size and validation_model. The last three are not defined in the
    visible part of this file -- they must exist before the methods are called.
    """

    def run_sim(self):
        """Print one 'Nearest to <word>: ...' line per validation word."""
        neighbour_count = 8  # number of nearest neighbors
        for position in range(valid_size):
            word_idx = valid_examples[position]
            query_word = reverse_dictionary[word_idx]
            scores = self._get_sim(word_idx)
            # Sort by descending score and skip rank 0 (presumably the query
            # word itself -- TODO confirm).
            ranked = (-scores).argsort()[1:neighbour_count + 1]
            pieces = ['Nearest to %s:' % query_word]
            for rank in range(neighbour_count):
                pieces.append('%s,' % (reverse_dictionary[ranked[rank]],))
            print(' '.join(pieces))

    @staticmethod
    def _get_sim(valid_word_idx):
        """Score ``valid_word_idx`` against every vocabulary index.

        Arguments:
        valid_word_idx -- index of the query word

        Returns:
        sim -- array of shape (vocab_size,) holding validation_model's output for
               each (query, candidate) index pair
        """
        scores = np.zeros((vocab_size,))
        query_input = np.zeros((1,))
        candidate_input = np.zeros((1,))
        query_input[0] = valid_word_idx
        for candidate_idx in range(vocab_size):
            candidate_input[0] = candidate_idx
            scores[candidate_idx] = validation_model.predict_on_batch(
                [query_input, candidate_input])
        return scores
def read_glove_vecs(glove_file):
    """
    Load GloVe word vectors from a whitespace-separated text file.

    Every non-blank line holds a token followed by its vector components.

    Arguments:
    glove_file -- path of the GloVe text file (UTF-8 encoded)

    Returns:
    words -- set of all tokens found in the file
    word_to_vec_map -- dictionary mapping each token to a float64 numpy array
    """
    words = set()
    word_to_vec_map = {}
    # GloVe files contain non-ASCII tokens; decode explicitly as UTF-8 rather
    # than with the platform default, which fails on non-UTF-8 locales.
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            fields = line.strip().split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            curr_word = fields[0]
            words.add(curr_word)
            word_to_vec_map[curr_word] = np.array(fields[1:], dtype=np.float64)
    return words, word_to_vec_map
def relu(x):
    """
    Rectified linear unit: element-wise max(0, x).

    Arguments:
    x -- A scalar or numpy array of any size.

    Return:
    relu(x) -- same shape as x, with negative entries replaced by 0
    """
    return np.maximum(0, x)
def initialize_parameters(vocab_size, n_h):
    """
    Create the weights and biases of a two-layer network.

    Re-seeds numpy's global random generator with 3, so repeated calls with the
    same sizes return identical values.

    Arguments:
    vocab_size -- size of the vocabulary (input and output dimension)
    n_h -- number of hidden units

    Returns:
    parameters -- python dictionary containing your parameters "W1", "b1", "W2", "b2":
                    W1 -- weight matrix of shape (n_h, vocab_size)
                    b1 -- bias vector of shape (n_h, 1)
                    W2 -- weight matrix of shape (vocab_size, n_h)
                    b2 -- bias vector of shape (vocab_size, 1)
    """
    np.random.seed(3)
    # Dict entries are evaluated in order, so W1 is drawn before W2.
    # Each weight matrix is scaled by 1/sqrt(fan_in).
    return {
        'W1': np.random.randn(n_h, vocab_size) / np.sqrt(vocab_size),
        'b1': np.zeros((n_h, 1)),
        'W2': np.random.randn(vocab_size, n_h) / np.sqrt(n_h),
        'b2': np.zeros((vocab_size, 1)),
    }
def softmax(x):
    """Return the softmax of the scores in x, normalised over all of its elements."""
    # Subtracting the maximum keeps np.exp from overflowing and does not
    # change the result.
    shifted = x - np.max(x)
    exps = np.exp(shifted)
    return exps / exps.sum()
本作业中,我们使用50维的 Glove 向量来表示词。导入数据:
# Load the GloVe vectors: `words` is the set of tokens in the file and
# `word_to_vec_map` maps each token to its numpy vector.
words, word_to_vec_map = read_glove_vecs('data/glove.6B.50d.txt')
其中
- words: 词典中的词集合
- word_to_vec_map: 表示单词到向量映射的map。
one-hot向量不擅长表示向量相似度(内积为0), Glove 向量包含了单词更多的信息,下面看看如何使用 Glove 向量计算相似度。
1 余弦相似度
为了测量两个词的相似程度,需要测量两个词嵌入向量之间的相似程度。 给定两个向量u和v,余弦相似度定义如下:
$$\text{CosineSimilarity}(u, v) = \frac{u \cdot v}{\|u\|_2 \, \|v\|_2} = \cos(\theta)$$
分子表示两个向量的内积,分母是两个向量的模(L2 范数)的乘积, $\theta$ 表示两个向量的夹角。向量方向越接近,夹角越小,cos 值越大。
练习:实现cosine_similarity()方法来测量向量相似度
谨记:向量的模:向量每项平方加和再开方
# GRADED FUNCTION: cosine_similarity
def cosine_similarity(u, v):
    """
    Cosine similarity reflects the degree of similarity between u and v

    Arguments:
        u -- a word vector of shape (n,) (numpy array or any 1-D array-like)
        v -- a word vector of shape (n,) (numpy array or any 1-D array-like)

    Returns:
        the cosine similarity between u and v, i.e. dot(u, v) / (||u||_2 * ||v||_2).
        If either vector has zero norm the division yields nan.
    """
    # Accept plain sequences as well as numpy arrays.
    u = np.asarray(u)
    v = np.asarray(v)

    # Dot product between u and v
    dot = np.dot(u, v)
    # L2 norms: square root of the sum of squared components
    norm_u = np.sqrt(np.sum(u ** 2))
    norm_v = np.sqrt(np.sum(v ** 2))

    return dot / (norm_u * norm_v)
#############################################
# Look up the GloVe vectors of a few probe words.
father = word_to_vec_map["father"]
mother = word_to_vec_map["mother"]
ball = word_to_vec_map["ball"]
crocodile = word_to_vec_map["crocodile"]
france = word_to_vec_map["france"]
italy = word_to_vec_map["italy"]
paris = word_to_vec_map["paris"]
rome = word_to_vec_map["rome"]
# Compare a related pair, an unrelated pair, and two difference vectors.
# The third pair is (country - capital) vs. (capital - country), so its
# expected similarity is negative.
print("cosine_similarity(father, mother) = ", cosine_similarity(father, mother))
print("cosine_similarity(ball, crocodile) = ",cosine_similarity(ball, crocodile))
print("cosine_similarity(france - paris, rome - italy) = ",cosine_similarity(france - paris, rome - italy))
# cosine_similarity(father, mother) = 0.890903844289
# cosine_similarity(ball, crocodile) = 0.274392462614
# cosine_similarity(france - paris, rome - italy) = -0.675147930817
期待的输出
key | value |
---|---|
cosine_similarity(father, mother) | 0.890903844289 |
cosine_similarity(ball, crocodile) | 0.274392462614 |
cosine_similarity(france - paris, rome - italy) | -0.675147930817 |
2 单词类比推理
类比推理任务需要补全 “a is to b as c is to __” 这样的句子,比如 “man is to woman as king is to queen”。我们需要找到单词 d,使得 $e_b - e_a \approx e_d - e_c$
也就是两组的差向量应该相似(仍然用 cos 来衡量)
练习:实现类比推理
# GRADED FUNCTION: complete_analogy
def complete_analogy(word_a, word_b, word_c, word_to_vec_map):
    """
    Performs the word analogy task: a is to b as c is to ____.

    Arguments:
    word_a -- a word, string
    word_b -- a word, string
    word_c -- a word, string
    word_to_vec_map -- dictionary that maps words to their corresponding vectors.

    Returns:
    best_word -- the word such that v_b - v_a is close to v_best_word - v_c, as measured
                 by cosine similarity; None when the map holds no word other than the inputs

    Raises:
    KeyError -- when one of the (lower-cased) input words is missing from word_to_vec_map
    """
    # convert words to lower case
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()

    # Get the word embeddings v_a, v_b and v_c
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # Both values are the same for every candidate, so compute them once
    # instead of on each loop iteration.
    target_offset = e_b - e_a
    excluded = {word_a, word_b, word_c}

    max_cosine_sim = -100  # below any possible cosine similarity
    best_word = None  # keeps track of the word to output

    # loop over the whole word vector set
    for w, e_w in word_to_vec_map.items():
        # to avoid best_word being one of the input words, pass on them.
        if w in excluded:
            continue

        # Similarity between (e_b - e_a) and (e_w - e_c)
        cosine_sim = cosine_similarity(target_offset, e_w - e_c)

        # Keep the best candidate seen so far; a nan similarity never wins
        # because the comparison is False.
        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = w

    return best_word
####################################################
# Each triad is (word_a, word_b, word_c); complete_analogy fills in the fourth
# word so that "a -> b :: c -> ?" holds.
triads_to_try = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smaller', 'large')]
for triad in triads_to_try:
    print ('{} -> {} :: {} -> {}'.format( *triad, complete_analogy(*triad,word_to_vec_map)))
# italy -> italian :: spain -> spanish
# india -> delhi :: japan -> tokyo
# man -> woman :: boy -> girl
# small -> smaller :: large -> larger
期待的输出
key | value |
---|---|
italy -> italian | spain -> spanish |
india -> delhi | japan -> tokyo |
man -> woman | boy -> girl |
small -> smaller | large -> larger |
你可以自己试试:small->smaller as big->?
谨记
- cos 是衡量向量相似度的好方法
- 对于 NLP 应用,使用一个预训练的模型开始工作是一个不错的选择
3 消除词向量偏见 (可选)
在下面的练习中,你将检查词嵌入中的性别偏见,并研究减少偏见的算法。 这部分涉及到一些线性代数,不要害怕,都是比较简单的。
先看看与性别有关的 Glove 词嵌入。 令 $g = e_{woman} - e_{man}$ ,那么可以认为向量 g 大致编码了“性别”这一概念。
# Difference between the embeddings of "woman" and "man", used below as a
# rough "gender" direction.
g = word_to_vec_map['woman'] - word_to_vec_map['man']
print(g)
# [-0.087144 0.2182 -0.40986 -0.03922 -0.1032 0.94165
# -0.06042 0.32988 0.46144 -0.35962 0.31102 -0.86824
# 0.96006 0.01073 0.24337 0.08193 -1.02722 -0.21122
# 0.695044 -0.00222 0.29106 0.5053 -0.099454 0.40445
# 0.30181 0.1355 -0.0606 -0.07131 -0.19245 -0.06115
# -0.3204 0.07165 -0.13337 -0.25068714 -0.14293 -0.224957
# -0.149 0.048882 0.12191 -0.27362 -0.165476 -0.20426