Course 5 - 序列模型 - 第二周作业 - 词向量的运算与Emoji生成器

最新推荐文章于 2023-11-14 10:34:48 发布

晴天下雨下雪下冰雹

最新推荐文章于 2023-11-14 10:34:48 发布

阅读量424

点赞数

分类专栏：深度学习与神经网络

本文链接：https://blog.csdn.net/qq_24599703/article/details/90272502

版权

深度学习与神经网络专栏收录该内容

14 篇文章 3 订阅

订阅专栏

1 - 词向量运算

1.如何加载训练好了的词向量

2.使用余弦相似性计算相似度

3.使用词嵌入来解决“男人与女人相比就像国王与____ 相比”之类的词语类比问题

4.修改词嵌入以减少性别偏见等

整体代码如下：

#!/usr/bin/env python
# _*_ coding:utf-8 _*_

import numpy as np
import w2v_utils

# 1. Load the word vectors from the GloVe file
words,word_to_vec_map=w2v_utils.read_glove_vecs("data/glove.6B.50d.txt")
# Inspect the vector stored for a word
# print(word_to_vec_map["hello"])

# 1.1定义余弦相似度---越相似值越大
def cosine_similarity(u,v):
    '''
    Return the cosine similarity of two vectors.

    The value is dot(u, v) / (||u||_2 * ||v||_2); the more similar the
    directions of u and v, the larger the result.

    :param u: word vector of shape (n,)
    :param v: word vector of shape (n,)
    :return: cosine similarity of u and v. If either vector has zero norm
             the quotient is 0/0 and NumPy yields nan.
    '''
    # Inner product of u and v
    dot = np.dot(u, v)
    # Euclidean (L2) norms of u and v
    norm_u = np.linalg.norm(u)
    norm_v = np.linalg.norm(v)
    # Keep the result in its own expression instead of a local that
    # shadows the function name.
    return np.divide(dot, norm_u * norm_v)
# 测试
# father = word_to_vec_map["father"]
# mother = word_to_vec_map["mother"]
# ball = word_to_vec_map["ball"]
# crocodile = word_to_vec_map["crocodile"]
# france = word_to_vec_map["france"]
# italy = word_to_vec_map["italy"]
# paris = word_to_vec_map["paris"]
# rome = word_to_vec_map["rome"]
#
# print("cosine_similarity(father, mother) = ", cosine_similarity(father, mother))
# print("cosine_similarity(ball, crocodile) = ",cosine_similarity(ball, crocodile))
# print("cosine_similarity(france - paris, rome - italy) = ",cosine_similarity(france - paris, rome - italy))

# 1.2词类类比
def complete_analogy(word_a,word_b,word_c,word_to_vec_map):
    '''
    Solve analogies of the form "a is to b as c is to ____".

    :param word_a: a word (string)
    :param word_b: a word (string)
    :param word_c: a word (string)
    :param word_to_vec_map: dict mapping words to their word vectors
    :return: best_word -- the word whose vector v gives the highest cosine
             similarity between (v_b - v_a) and (v - v_c); None when no
             candidate word scores above the initial threshold
    :raises KeyError: if any of the three (lower-cased) words is missing
             from word_to_vec_map
    '''
    # Lower-case the input words
    word_a, word_b, word_c = word_a.lower(), word_b.lower(), word_c.lower()
    # Look up their word vectors
    e_a, e_b, e_c = word_to_vec_map[word_a], word_to_vec_map[word_b], word_to_vec_map[word_c]

    # Loop-invariant values: the input words to skip and the a -> b offset.
    excluded = {word_a, word_b, word_c}
    target = e_b - e_a

    # Cosine similarity is never below -1, so -100 is a safe starting value.
    max_cosine_sim = -100
    best_word = None

    # Scan the whole vocabulary
    for word, vec in word_to_vec_map.items():
        if word in excluded:
            continue
        # Higher cosine similarity means a closer match
        cosine_sim = cosine_similarity(target, vec - e_c)

        if cosine_sim > max_cosine_sim:
            max_cosine_sim = cosine_sim
            best_word = word
    return best_word
# 测试
# triads_to_try = [('italy', 'italian', 'spain'), ('india', 'delhi', 'japan'), ('man', 'woman', 'boy'), ('small', 'smaller', 'large')]
# for triad in triads_to_try:
#     print('{} -> {} <====> {} -> {}'.format( *triad, complete_analogy(*triad,word_to_vec_map)))

# 1.3去除词向量中的偏见

# Gender direction contained in the embeddings: vector("woman") - vector("man")
g = word_to_vec_map['woman'] - word_to_vec_map['man']
# print(g)
# # 不同单词与g的余弦相似度，考虑相似度的正值与余弦相似度的负值之间的关系
# # 即与性别偏差轴的余弦相似度
# word_list = ['lipstick', 'guns', 'science', 'arts', 'literature', 'warrior','doctor', 'tree', 'receptionist',
#              'technology',  'fashion', 'teacher', 'engineer', 'pilot', 'computer', 'singer']
# for w in word_list:
#     print (w, cosine_similarity(word_to_vec_map[w], g))
# 执行结果
# lipstick 0.276919162564
# guns -0.18884855679
# science -0.0608290654093
# arts 0.00818931238588
# literature 0.0647250443346
# warrior -0.209201646411
# doctor 0.118952894109 偏向于女性
# tree -0.0708939917548
# receptionist 0.330779417506
# technology -0.131937324476
# fashion 0.0356389462577
# teacher 0.179209234318
# engineer -0.0803928049452
# pilot 0.00107644989919
# computer -0.103303588739偏向于男性
# singer 0.185005181365

# 1.3.1 消除与性别无关的词汇的偏差
# e_bias_component=e和g两个向量的点乘除以g的二范数的平方，再乘以向量g，可得到是e在g方向上的投影
# e_debiased=e-e_bias_component=向量相减求取垂直于g方向的向量
def neutralize(word,g,word_to_vec_map):
    '''
    Remove the bias of a word by projecting its vector onto the subspace
    orthogonal to the bias axis, so its component along g becomes zero.

    :param word: the word (string) to de-bias
    :param g: bias axis (e.g. gender), a vector of the same shape as the
              word vectors
    :param word_to_vec_map: dict mapping words to their word vectors
    :return: e_debiased -- the word vector with its component along g removed
    '''
    embedding = word_to_vec_map[word]

    # Scalar coefficient of the projection onto g: (e . g) / ||g||^2
    g_norm_sq = np.square(np.linalg.norm(g))
    coefficient = np.divide(np.dot(embedding, g), g_norm_sq)

    # Subtracting the projection leaves only the part orthogonal to g
    return embedding - coefficient * g
# 测试
# e = "receptionist"
# print("去偏差前{0}与g的余弦相似度为：{1}".format(e, cosine_similarity(word_to_vec_map["receptionist"], g)))
#
# e_debiased = neutralize("receptionist", g, word_to_vec_map)
# print("去偏差后{0}与g的余弦相似度为：{1}".format(e, cosine_similarity(e_debiased, g)))


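# 1.3.2 Equalization for gender-specific word pairs -- make a pair such as actor/actress symmetric about the subspace orthogonal to g
# Apply the equalization formulas; for the formulas see https://blog.csdn.net/u013733326/article/details/83341643#t0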
def equalize(pair,bias_axis,word_to_vec_map):
    '''
    Equalize a pair of words with respect to a bias axis.

    Both returned vectors share the same component orthogonal to the bias
    axis (that of the pair's mean) and have bias-axis components that are
    equal in magnitude and opposite in sign about the mean.

    :param pair: the two words to equalize, e.g. ("actress", "actor")
    :param bias_axis: bias axis (e.g. gender), a vector of the same shape as
                      the word vectors
    :param word_to_vec_map: dict mapping words to their word vectors
    :return:
     e1 -- equalized vector for the first word
     e2 -- equalized vector for the second word
    '''
    # Step 1: look up the word vectors
    w1, w2 = pair
    e_w1, e_w2 = word_to_vec_map[w1], word_to_vec_map[w2]

    axis_norm_sq = np.square(np.linalg.norm(bias_axis))

    def project(vec):
        # Projection of vec onto the bias axis.
        return np.divide(np.dot(vec, bias_axis), axis_norm_sq) * bias_axis

    # Step 2: mean of the two vectors
    mu = (e_w1 + e_w2) / 2.0

    # Step 3: split the mean into its bias-axis and orthogonal parts
    mu_B = project(mu)
    mu_orth = mu - mu_B

    # Step 4: bias-axis components of each word
    e_w1B = project(e_w1)
    e_w2B = project(e_w2)

    # Step 5: rescale each bias offset. The offset must be divided by its
    # scalar L2 norm; dividing element-wise by an absolute-value vector would
    # move the result off the bias axis. abs() keeps the square root real
    # when ||mu_orth|| > 1 (vectors that are not unit-normalized).
    scale = np.sqrt(np.abs(1 - np.square(np.linalg.norm(mu_orth))))

    def rescale(component):
        offset = component - mu_B
        length = np.linalg.norm(offset)
        if length == 0:
            # Both words have the same bias component: nothing to spread.
            return np.zeros_like(offset)
        return scale * offset / length

    # Step 6: recombine the corrected bias part with the shared orthogonal part
    e1 = rescale(e_w1B) + mu_orth
    e2 = rescale(e_w2B) + mu_orth

    return e1, e2
# Test: compare similarity to the gender axis before and after equalization
print("==========均衡校正前==========")
print("cosine_similarity(word_to_vec_map[\"man\"], gender) = ", cosine_similarity(word_to_vec_map["man"], g))
print("cosine_similarity(word_to_vec_map[\"woman\"], gender) = ", cosine_similarity(word_to_vec_map["woman"], g))
e1, e2 = equalize(("man", "woman"), g, word_to_vec_map)
print("\n==========均衡校正后==========")
# After equalization the two values are expected to have nearly equal magnitude
print("cosine_similarity(e1, gender) = ", cosine_similarity(e1, g))
print("cosine_similarity(e2, gender) = ", cosine_similarity(e2, g))

2 - Emoji表情生成器

2.1简单模型的生成器，

简单的前向传播，不考虑词序

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import emo_utils
import pandas as pd
# Read the training and test sets (sentences X and labels Y)
X_train,Y_train=emo_utils.read_csv('data/train_emoji.csv')
X_test,Y_test=emo_utils.read_csv('data/test.csv')

# NOTE(review): this takes the training sentence with the most characters and
# counts its words, which is not necessarily the largest word count -- confirm.
maxLen=len(max(X_train,key=len).split())

# Inspect a sample
# index = 3
# print(X_train[index], emo_utils.label_to_emoji(Y_train[index]))

# Convert the labels Y into the format a softmax classifier needs, i.e. from shape (m, 1) to one-hot encoding
# Y_oh_train=emo_utils.convert_to_one_hot(Y_train,C=5)
# Y_oh_test=emo_utils.convert_to_one_hot(Y_test,C=5)
# Inspect the one-hot encoding
# index = 0
# print("{0}对应的独热编码是{1}".format(Y_train[index], Y_oh_train[index]))

# Load the word vectors
# word_to_index: dict mapping the vocabulary (400,001 words) to indices (valid range: 0-400,000)
# index_to_word: dict mapping indices back to words
word_to_index, index_to_word, word_to_vec_map = emo_utils.read_glove_vecs('data/glove.6B.50d.txt')
# 测试
# word = "cucumber"
# index = 113317
# print("单词{0}对应的索引是：{1}".format(word, word_to_index[word]))
# print("索引{0}对应的单词是：{1}".format(index, index_to_word[index]))

# 将句子转为单词后转为向量求平均
def sentence_to_avg(sentence,word_to_vec_map):
    '''
    Encode a sentence as the average of the word vectors of its words.

    The sentence is lower-cased and split on whitespace. The vector
    dimension is taken from the word vectors themselves rather than being
    fixed at 50.

    :param sentence: a string, one sample from X
    :param word_to_vec_map: dict mapping words to their word vectors
    :return: avg -- mean of the word vectors, same shape as one word vector.
             A sentence with no words yields an all-zero vector instead of
             the nan vector that 0/0 would produce.
    :raises KeyError: if a word of the sentence is not in word_to_vec_map
    '''
    # Step 1: split the sentence into lower-case words
    words = sentence.lower().split()

    if not words:
        # No word to average: return zeros of the embedding dimension
        # (falling back to 50 when the map itself is empty).
        sample = next(iter(word_to_vec_map.values()), None)
        dim = 50 if sample is None else np.shape(sample)[0]
        return np.zeros(dim)

    # Step 2: accumulator shaped like the word vectors
    avg = np.zeros(np.shape(word_to_vec_map[words[0]]))

    # Step 3: sum the vectors and divide by the word count
    for w in words:
        avg += word_to_vec_map[w]
    return np.divide(avg, len(words))
# 测试
# avg = sentence_to_avg("Morrocan couscous is my favorite dish", word_to_vec_map)
# print("avg = ", avg)

# Model: forward propagation through softmax, compute the loss, back-propagate, then update the parameters
def model(X,Y,word_to_vec_map,learning_rate=0.01,num_iterations=400):
    '''
    Train a softmax classifier on averaged word vectors with plain NumPy,
    using per-sample (stochastic) gradient descent.

    :param X: array of input sentences (strings), one per sample
    :param Y: array of integer labels, one per sample; they are one-hot
              encoded with n_y = 5 classes (presumably values 0-4 -- confirm
              against emo_utils.convert_to_one_hot)
    :param word_to_vec_map: dict mapping words to 50-dimensional word vectors
    :param learning_rate: gradient-descent step size
    :param num_iterations: number of passes over the training set
    :return:
     pred -- predictions for X computed with the final W and b
        W -- weight matrix of shape (n_y, n_h)
        b -- bias vector of shape (n_y,)
        forward pass per sample: z = W . avg + b, (5,50)(50,) -> (5,)
    '''
    np.random.seed(1)
    # Number of training samples
    m = Y.shape[0]
    n_y = 5
    n_h = 50

    # Xavier initialization
    W = np.random.randn(n_y, n_h) / np.sqrt(n_h)
    b = np.zeros((n_y,))

    # One-hot encode the labels
    Y_oh = emo_utils.convert_to_one_hot(Y, C=n_y)

    # Defined up front so the progress print works even with zero samples
    cost = 0.0

    # Optimization loop
    for t in range(num_iterations):
        for i in range(m):
            # Average word vector of the i-th sentence
            avg = sentence_to_avg(X[i], word_to_vec_map)

            # Forward propagation
            z = np.dot(W, avg) + b
            a = emo_utils.softmax(z)

            # Cross-entropy loss of the i-th sample
            cost = -np.sum(Y_oh[i] * np.log(a))

            # Gradients of the loss w.r.t. z, W and b
            dz = a - Y_oh[i]
            dW = np.dot(dz.reshape(n_y, 1), avg.reshape(1, n_h))
            db = dz

            # Parameter update
            W = W - learning_rate * dW
            b = b - learning_rate * db
        if t % 100 == 0:
            # cost is the loss of the last sample seen in this pass
            print("第{t}轮，损失为{cost}".format(t=t, cost=cost))
            # Called for progress reporting (presumably prints accuracy -- confirm)
            emo_utils.predict(X, Y, W, b, word_to_vec_map)

    # Predict with the final parameters, so pred always matches the returned
    # W and b and is defined even when num_iterations is 0.
    pred = emo_utils.predict(X, Y, W, b, word_to_vec_map)
    return pred, W, b
# 测试
# print(X_train.shape)
# print(Y_train.shape)
# print(np.eye(5)[Y_train.reshape(-1)].shape)
# print(X_train[0])
# print(type(X_train))
# Y = np.asarray([5,0,0,5, 4, 4, 4, 6, 6, 4, 1, 1, 5, 6, 6, 3, 6, 3, 4, 4])
# print(Y.shape)

# X = np.asarray(['I am going to the bar tonight', 'I love you', 'miss you my dear',
#  'Lets go party and drinks','Congrats on the new job','Congratulations',
#  'I am so happy for you', 'Why are you feeling bad', 'What is wrong with you',
#  'You totally deserve this prize', 'Let us go play football',
#  'Are you down for football this afternoon', 'Work hard play harder',
#  'It is suprising how people can be dumb sometimes',
#  'I am very disappointed','It is the best day in my life',
#  'I think I will end up alone','My life is so boring','Good job',
#  'Great so awesome'])

# Training
pred, W, b =model(X_train,Y_train,word_to_vec_map)

# Evaluation on the training and test sets
print("=====训练集====")
pred_train = emo_utils.predict(X_train, Y_train, W, b, word_to_vec_map)
print("=====测试集====")
pred_test = emo_utils.predict(X_test, Y_test, W, b, word_to_vec_map)

# X_my_sentences = np.array(["i adore you", "i love you", "funny lol", "lets play with a ball", "food is ready", "you are not happy"])
# Y_my_labels = np.array([[0], [0], [2], [1], [4],[3]])
#
# pred = emo_utils.predict(X_my_sentences, Y_my_labels , W, b, word_to_vec_map)
# emo_utils.print_predictions(X_my_sentences, pred)

# "you are not happy" is labelled ❤ because this model ignores word order

# Header row with the emoji of each class
print(" \t {0} \t {1} \t {2} \t {3} \t {4}".format(emo_utils.label_to_emoji(0), emo_utils.label_to_emoji(1),
                                                 emo_utils.label_to_emoji(2), emo_utils.label_to_emoji(3),
                                                 emo_utils.label_to_emoji(4)))

# Flatten the predictions with reshape(-1) so the confusion table works for
# any test-set size, not only the 56 samples of data/test.csv.
print(pd.crosstab(Y_test, pred_test.reshape(-1), rownames=['Actual'], colnames=['Predicted'], margins=True))
emo_utils.plot_confusion_matrix(Y_test, pred_test)

2.2LSTM，考虑词序

其中经过各层之后的维度如下：

#!/usr/bin/env python
# _*_ coding:utf-8 _*_
import numpy as np
import emo_utils
np.random.seed(0)
import keras
from keras.models import Model
from keras.layers import Dense, Input, Dropout, LSTM, Activation
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

np.random.seed(1)
from keras.initializers import glorot_uniform
# Use an LSTM so that word order is taken into account when predicting the emoji
word_to_index, index_to_word, word_to_vec_map = emo_utils.read_glove_vecs('data/glove.6B.50d.txt')
X_train,Y_train=emo_utils.read_csv('data/train_emoji.csv')
X_test,Y_test=emo_utils.read_csv('data/test.csv')
# Largest number of words in any training sentence. Counting the words of
# the sentence with the most characters can under-estimate this and make
# sentences_to_indices write past the last column.
maxLen = max(len(sentence.split()) for sentence in X_train)
# 1.数据预处理-将X转为Embedding()函数接受的列表或矩阵
def sentences_to_indices(X,word_to_index,max_len):
    '''
    Convert an array of sentences into a zero-padded matrix of word indices,
    the input format used for the Embedding() layer.

    :param X: array of sentences (strings) with m entries
    :param word_to_index: dict mapping words to their indices
    :param max_len: number of columns of the result; no sentence may contain
                    more words than this
    :return: X_indices -- array of shape (m, max_len) holding the word
             indices of each sentence, padded on the right with zeros
    '''
    num_sentences = X.shape[0]
    # Unused trailing positions keep the value 0
    X_indices = np.zeros((num_sentences, max_len))

    for row in range(num_sentences):
        # Lower-case the sentence and split it into words
        tokens = X[row].lower().split()
        # Store the index of each word at its position within the sentence
        for col, token in enumerate(tokens):
            X_indices[row, col] = word_to_index[token]
    return X_indices
# 测试
# X1 = np.array(["funny lol", "lets play baseball", "food is ready for you"])
# X1_indices = sentences_to_indices(X1,word_to_index, max_len = 5)
# print("X1 =", X1)
# print("X1_indices =", X1_indices)

# 2.定义embedding层
def pretrained_embedding_layer(word_to_vec_map,word_to_index):
    '''
    Create a Keras Embedding() layer whose weights are the given pre-trained
    word vectors.

    :param word_to_vec_map: dict mapping words to their word vectors
    :param word_to_index: dict mapping words to their vocabulary indices
    :return: embedding_layer -- a non-trainable Keras Embedding layer
             initialized with the word vectors
    '''
    # One row more than the number of words.
    # NOTE(review): presumably indices start at 1 and row 0 is left as the
    # all-zero padding row -- confirm against emo_utils.read_glove_vecs.
    vocab_len = len(word_to_index) + 1
    # Take the dimension from any vector instead of a hard-coded word, so a
    # vocabulary without "cucumber" still works.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    # Embedding matrix, initialized to zeros
    emb_matrix = np.zeros((vocab_len, emb_dim))

    # Row `index` holds the vector of the word with that index
    for word, index in word_to_index.items():
        emb_matrix[index, :] = word_to_vec_map[word]

    # trainable=False keeps the pre-trained vectors fixed during training
    embedding_layer = Embedding(vocab_len, emb_dim, trainable=False)

    # Build the layer so its weights exist before they are set
    embedding_layer.build((None,))

    # Load the (vocab_len, emb_dim) matrix as the layer's weights
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer
# 测试
# embedding_layer = pretrained_embedding_layer(word_to_vec_map, word_to_index)
# print("weights[0][1][3] =", embedding_layer.get_weights()[0][1][3])

# 3.构建模型
def Emojify_V2(input_shape,word_to_vec_map,word_to_index):
    '''
    Build the computation graph of the Emojify-V2 model.

    :param input_shape: shape of the input, usually (max_len,)
    :param word_to_vec_map: dict mapping words to their word vectors
    :param word_to_index: dict mapping words to their vocabulary indices
    :return: model -- the Keras Model instance
    '''
    # Graph input: a batch of int32 word-index sequences
    sentence_indices = Input(input_shape, dtype='int32')

    # Pre-trained embedding layer applied to the index sequences
    embed = pretrained_embedding_layer(word_to_vec_map, word_to_index)
    net = embed(sentence_indices)

    # First LSTM (128 units) returns the whole sequence of hidden states
    net = LSTM(128, return_sequences=True)(net)
    net = Dropout(0.5)(net)

    # Second LSTM (128 units) returns only the final hidden state
    net = LSTM(128, return_sequences=False)(net)
    net = Dropout(0.5)(net)

    # Dense layer to 5 outputs, followed by softmax
    net = Dense(5)(net)
    probabilities = Activation('softmax')(net)

    return Model(inputs=sentence_indices, outputs=probabilities)
model = Emojify_V2((maxLen,), word_to_vec_map, word_to_index)
# model.summary()
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Convert the training sentences to index sequences and the labels to one-hot vectors
X_train_indices = sentences_to_indices(X_train, word_to_index, maxLen)
Y_train_oh = emo_utils.convert_to_one_hot(Y_train, C = 5)
# Train the model
model.fit(X_train_indices, Y_train_oh, epochs = 50, batch_size = 32, shuffle=True)

# Performance on the test set
X_test_indices = sentences_to_indices(X_test, word_to_index, max_len = maxLen)
Y_test_oh = emo_utils.convert_to_one_hot(Y_test, C = 5)
loss, acc = model.evaluate(X_test_indices, Y_test_oh)

print("Test accuracy = ", acc)
# Print the misclassified sentences
# NOTE(review): C is not used anywhere below in this script
C = 5
pred = model.predict(X_test_indices)
for i in range(len(X_test)):
    # NOTE(review): x is assigned but never read
    x = X_test_indices
    # Predicted class = index of the largest softmax output
    num = np.argmax(pred[i])
    if(num != Y_test[i]):
        print('正确表情：'+ emo_utils.label_to_emoji(Y_test[i]) + '   预测结果： '+ X_test[i] + emo_utils.label_to_emoji(num).strip())
# Try a sentence of your own
x_test = np.array(['you are so beautiful'])
# NOTE: this rebinds X_test_indices, replacing the test-set indices computed above
X_test_indices = sentences_to_indices(x_test, word_to_index, maxLen)
print(x_test[0] +' '+  emo_utils.label_to_emoji(np.argmax(model.predict(X_test_indices))))