NLP-task5

Prediction-based word vectors

Prediction-based word vectors are currently the most popular kind, word2vec being the best-known example. This assignment first builds count-based vectors from a co-occurrence matrix (Part 1) and then explores the word vectors produced by word2vec (Part 2).
The word2vec part uses gensim to explore pretrained vectors rather than implementing word2vec from scratch; the vectors are 300-dimensional and were released by Google.

# Import packages
import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors  # KeyedVectors: maps entities (words, documents, images, etc.) to vectors; each entity is identified by its string id.
from gensim.test.utils import datapath
import pprint     #  formats printed output so it is easier to read
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]  #  plt.rcParams sets figure properties such as size and resolution
# import nltk
# nltk.download('reuters')    # GitHub download: https://github.com/nltk/nltk_data/tree/gh-pages/packages/corpora
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)

# Load the "reuters" corpus
def read_corpus(category="crude"):
    """ Read files from the specified Reuter's category.
        Params:
            category (string): category name
        Return:
            list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)    # documents in the given category (default: 'crude')
    # lowercase every document and add start/end tokens
    return [[START_TOKEN] + [w.lower() for w in list(reuters.words(f))] + [END_TOKEN] for f in files]
print()

# read_corpus loads the corpus and does some light preprocessing:
# it adds a token at the start and end of each document to mark sentence boundaries, and splits the text into words.
# The pprint module pretty-prints the result.
# pprint.pprint(object, stream=None, indent=1, width=80, depth=None, *, compact=False)
# width: display width, 80 characters by default. Note: a single object longer than width is not wrapped onto multiple lines; it simply exceeds the limit.
# compact: defaults to False. If False, a sequence longer than width is spread across multiple lines; if True, as many items as fit are packed into each line of width characters.
reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:1], compact=True, width=100)  # with compact=False each word is printed on its own line

# Question 1.1: implement distinct_words
# Compute the list of distinct words in the corpus and their count
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1

    # Write your implementation here.
    corpus = [w for sent in corpus for w in sent]
    corpus_words = list(set(corpus))
    corpus_words = sorted(corpus_words)
    num_corpus_words = len(corpus_words)
    # the returned list contains every distinct word in the corpus in alphabetical order
    return corpus_words, num_corpus_words
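
# A minimal sanity check for distinct_words on a toy corpus (the two toy documents
# below are made up for illustration; they are not part of the Reuters data):
test_corpus = [["START", "all", "that", "glitters", "isn't", "gold", "END"],
               ["START", "all's", "well", "that", "ends", "well", "END"]]
test_words, test_num_words = distinct_words(test_corpus)
ans_words = sorted(["START", "END", "all", "that", "glitters", "isn't", "gold", "all's", "well", "ends"])
assert test_words == ans_words, "word lists do not match"
assert test_num_words == len(ans_words), "word counts do not match"
print("distinct_words sanity check passed")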

# Question 1.2: implement the co-occurrence matrix
# Compute the co-occurrence matrix for the given corpus: for each word w, count how often each word appears within window_size positions before or after it.
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}

    # Write your implementation here.
    M = np.zeros(shape=(num_words, num_words), dtype=np.int32)
    for i in range(num_words):
        word2Ind[words[i]] = i

    for sent in corpus:
        for p in range(len(sent)):
            ci = word2Ind[sent[p]]

            # preceding
            for w in sent[max(0, p - window_size):p]:
                wi = word2Ind[w]
                M[ci][wi] += 1

            # subsequent
            for w in sent[p + 1:p + 1 + window_size]:
                wi = word2Ind[w]
                M[ci][wi] += 1

    return M, word2Ind
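
# A quick, hedged sanity check for compute_co_occurrence_matrix: with window_size=1
# each word only co-occurs with its immediate neighbours, and the count matrix is
# symmetric. (The toy document below is made up for illustration.)
test_corpus = [["START", "all", "that", "glitters", "is", "not", "gold", "END"]]
M_test, word2Ind_test = compute_co_occurrence_matrix(test_corpus, window_size=1)
assert M_test.shape == (8, 8), "expected an 8 x 8 matrix for 8 distinct words"
assert (M_test == M_test.T).all(), "co-occurrence counts should be symmetric"
assert M_test[word2Ind_test["all"]][word2Ind_test["that"]] == 1
assert M_test[word2Ind_test["all"]][word2Ind_test["gold"]] == 0
print("compute_co_occurrence_matrix sanity check passed")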

# Question 1.3: reduce to k dimensions
# This step is dimensionality reduction.
# Question 1.2 produced an N x N matrix (N is the vocabulary size); using scikit-learn's truncated SVD (singular value decomposition), we factor this large matrix into a smaller N x k matrix with k features per word.
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # Write your implementation here.
    # fit_transform returns U * S directly, as the docstring asks; n_iters is passed to TruncatedSVD
    svd = TruncatedSVD(n_components=k, n_iter=n_iters)
    M_reduced = svd.fit_transform(M)

    print("Done.")
    return M_reduced
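
# A small shape check for reduce_to_k_dim on a toy co-occurrence matrix (the toy
# document is the same illustrative one used above): the reduced matrix should keep
# one row per word and have exactly k columns.
M_shape_test, _ = compute_co_occurrence_matrix(
    [["START", "all", "that", "glitters", "is", "not", "gold", "END"]], window_size=1)
M_shape_reduced = reduce_to_k_dim(M_shape_test, k=2)
assert M_shape_reduced.shape == (M_shape_test.shape[0], 2), "reduced matrix has the wrong shape"
print("reduce_to_k_dim sanity check passed")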

# Question 1.4: implement plot_embeddings
# Write a function that plots a set of 2D vectors in 2D space.
# Based on matplotlib: scatter draws a marker for each point and annotate writes the word next to it.
def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensional word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # Write your implementation here.
    fig = plt.figure()
    plt.style.use("seaborn-whitegrid")
    for word in words:
        point = M_reduced[word2Ind[word]]
        plt.scatter(point[0], point[1], marker="^")
        plt.annotate(word, xy=(point[0], point[1]), xytext=(point[0], point[1] + 0.1))

# Test the plotting function
print("-" * 80)
print("Outputted Plot:")

M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
word2Ind_plot_test = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3, 'test5': 4}
words = ['test1', 'test2', 'test3', 'test4', 'test5']
plot_embeddings(M_reduced_plot_test, word2Ind_plot_test, words)

print("-" * 80)
print()

# Question 1.5: co-occurrence plot analysis
# Reduce the word embeddings to 2 dimensions and normalize them so that each word vector lies on the unit circle, then look for words that end up close to each other in the plot.
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis] # broadcasting

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']

plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
plt.show()

# Part 2: prediction-based word vectors
# Use gensim to explore pretrained word vectors instead of implementing word2vec ourselves; the vectors are 300-dimensional and were released by Google.
def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"):
    """ Load Word2Vec Vectors
        Param:
            embeddings_fp (string) - path to .bin file of pretrained word vectors
        Return:
            wv_from_bin: All 3 million embeddings, each of length 300
                This is the KeyedVectors format: https://radimrehurek.com/gensim/models/deprecated/keyedvectors.html
    """
    embed_size = 300
    print("Loading 3 million word vectors from file...")
    ## file downloaded manually
    wv_from_bin = KeyedVectors.load_word2vec_format(embeddings_fp, binary=True)
    vocab = list(wv_from_bin.vocab.keys())
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
wv_from_bin = load_word2vec()
print()
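
# Note (assumption about the environment): the loader above uses the gensim 3.x API, where the
# vocabulary is exposed as `wv_from_bin.vocab`; in gensim 4.x the equivalent attribute is
# `wv_from_bin.key_to_index`. If the local .bin file is not available, the same pretrained
# vectors can also be fetched via gensim's downloader (this downloads a large file on first use):
# import gensim.downloader as api
# wv_from_bin = api.load("word2vec-google-news-300")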

# First use SVD to reduce the 300-dimensional vectors to 2 dimensions so they can be plotted.
# Question 2.1: word2vec plot analysis
# Same procedure as Question 1.5
def get_matrix_of_vectors(wv_from_bin, required_words=['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']):
    """ Put the word2vec vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 3 million word2vec vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 300) containing the vectors
            word2Ind: dictionary mapping each word to its row number in M
    """
    import random
    words = list(wv_from_bin.vocab.keys())
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]       # keep 10000 words
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    for w in required_words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2Ind

# Plot the reduced word2vec embeddings
print("-" * 80)
print("Outputted Plot:")
print("-" * 80)

M, word2Ind = get_matrix_of_vectors(wv_from_bin)
M_reduced = reduce_to_k_dim(M, k=2)         # reduce to 2 dimensions
plt.tight_layout()
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced, word2Ind, words)
plt.show()

# Question 2.2: polysemy
# Find a word with multiple meanings (e.g. "leaves", "scoop") whose top-10 most similar words (by cosine similarity) cover two different senses. For example, the top-10 for "leaves" includes "vanishes" (the 'departs' sense) and "stalks" (the plant sense).
# The word found here is "column": its top-10 includes "columnist" and "article"
w0 = "column"
w0_mean = wv_from_bin.most_similar(w0)
print("column:", w0_mean)
print()

# Question 2.3: synonyms and antonyms
# Find three words (w1, w2, w3) where w1 and w2 are synonyms and w1 and w3 are antonyms, yet the distance between w1 and w3 is smaller than the distance between w1 and w2.
# For example: w1 = "happy", w2 = "cheerful", w3 = "sad"
w1 = "love"
w2 = "like"
w3 = "hate"
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)
print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))
print()
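
# gensim's distance() returns the cosine distance, i.e. 1 - cosine similarity. As a quick,
# illustrative cross-check (not part of the original assignment), the same value can be
# computed by hand with numpy for w1 and w2:
v1, v2 = wv_from_bin.word_vec(w1), wv_from_bin.word_vec(w2)
manual_dist = 1 - np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))
print("Manual cosine distance for {}, {}: {}".format(w1, w2, manual_dist))
print()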

# Question 2.4: analogy
# "man is to king as woman is to ___" -- questions like this can also be answered with word2vec
# man : him :: woman : her
print("Analogy man : him :: woman : her:")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'him'], negative=['man']))
print()
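
# most_similar(positive=['woman', 'him'], negative=['man']) answers the analogy by searching for
# the word whose vector is closest to (roughly) him - man + woman. A rough manual equivalent is
# sketched below (illustrative only; gensim normalises each vector before combining them, and
# unlike most_similar this version does not exclude the query words from the results):
target = (wv_from_bin.word_vec('him', use_norm=True)
          - wv_from_bin.word_vec('man', use_norm=True)
          + wv_from_bin.word_vec('woman', use_norm=True))
pprint.pprint(wv_from_bin.similar_by_vector(target, topn=5))
print()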


# Question 2.5: incorrect analogy
# Find an analogy the model gets wrong, e.g. tree : leaf :: flower : petal
print("Incorrect analogy tree : leaf :: flower : petal:")
pprint.pprint(wv_from_bin.most_similar(positive=['leaf', 'flower'], negative=['tree']))
print()

# Question 2.6: bias analysis
# Bias in word vectors matters, e.g. gender or racial bias. Run the code below and consider two questions:
# (a) Which word is most similar to "woman" and "boss" and least similar to "man"?
# (b) Which word is most similar to "man" and "boss" and least similar to "woman"?
print("Bias woman : boss :: man:")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
print()
print("偏见 man : boss :: woman:")
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))
print()

# Question 2.7: explore bias on your own
#     man : woman :: doctor : ___
#     woman : man :: doctor : ___
print("Bias example woman : doctor :: man:")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'doctor'], negative=['man']))
print()
print("自行分析偏见 man : doctor :: woman:")
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'doctor'], negative=['woman']))
print()
