NLP-task5

最新推荐文章于 2020-12-24 01:35:11 发布
Alhfors
最新推荐文章于 2020-12-24 01:35:11 发布
阅读量217
点赞数
本文链接：https://blog.csdn.net/Alhfors/article/details/107170215
版权
基于预测的词向量
目前，基于预测的词向量是最流行的，比如word2vec。现在我们来探索word2vec生成的词向量。
这一部分主要是使用gensim探索词向量，不是自己实现word2vec，所使用的词向量维度是300，由google发布。
# 导入包
import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors  # KeyedVectors:实现实体（单词、文档、图片都可以）和向量之间的映射。每个实体由其字符串id标识。
from gensim.test.utils import datapath
import pprint     #  输出的更加规范易读
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]  #  plt.rcParams主要作用是设置画的图的分辨率，大小等信息
# import nltk
# nltk.download('reuters')    # GitHub下载地址：https://github.com/nltk/nltk_data/tree/gh-pages/packages/corpora
from nltk.corpus import reuters
import numpy as np
import random
import rando
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

START_TOKEN = '<START>'
END_TOKEN = '<END>'

np.random.seed(0)
random.seed(0)

# 导入 "reuters" 语料库
def read_corpus(category="crude"):
    """ Read files from the specified Reuter's category.
        Params:
            category (string): category name
        Return:
            list of lists, with words from each of the processed files
    """
    files = reuters.fileids(category)    # 类别为crude文档
    # 每个文档都转化为小写， 并在开头结尾加标识符
    return [[START_TOKEN] + [w.lower() for w in list(reuters.words(f))] + [END_TOKEN] for f in files]
print()

# 导入语料库的函数，简单的进行了一下预处理，
# 在每句话的前面和后面各加了一个标识符，表示句子的开始和结束，然后把每个单词分开。
# pprint模块格式化打印
# pprint.pprint(object, stream=None, indent=1, width=80, depth=None, *, compact=False)
# width：控制打印显示的宽度。默认为80个字符。注意：当单个对象的长度超过width时，并不会分多行显示，而是会突破规定的宽度。
# compact：默认为False。如果值为False，超过width规定长度的序列会被分散打印到多行。如果为True，会尽量使序列填满width规定的宽度。
reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:1], compact=True, width=100)  # compact 设置为False是一行一个单词

# 问题1.1：实现不同单词
# 计算语料库的单词数量、单词集
def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.
        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): list of distinct words across the corpus, sorted (using python 'sorted' function)
            num_corpus_words (integer): number of distinct words across the corpus
    """
    corpus_words = []
    num_corpus_words = -1

    # Write your implementation here.
    corpus = [w for sent in corpus for w in sent]
    corpus_words = list(set(corpus))
    corpus_words = sorted(corpus_words)
    num_corpus_words = len(corpus_words)
    # 返回的结果是语料库中的所有单词按照字母顺序排列的。
    return corpus_words, num_corpus_words

# 问题1.2：实现共现矩阵
# 计算给定语料库的共现矩阵。具体来说，对于每一个词 w，统计前、后方 window_size 个词的出现次数\
def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "START All that glitters is not gold END" with window size of 4,
              "All" will co-occur with "START", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (numpy matrix of shape (number of corpus words, number of corpus words)):
                Co-occurence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2Ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    M = None
    word2Ind = {}

    # Write your implementation here.
    M = np.zeros(shape=(num_words, num_words), dtype=np.int32)
    for i in range(num_words):
        word2Ind[words[i]] = i

    for sent in corpus:
        for p in range(len(sent)):
            ci = word2Ind[sent[p]]

            # preceding
            for w in sent[max(0, p - window_size):p]:
                wi = word2Ind[w]
                M[ci][wi] += 1

            # subsequent
            for w in sent[p + 1:p + 1 + window_size]:
                wi = word2Ind[w]
                M[ci][wi] += 1

    return M, word2Ind

# 问题1.3：实现降到k维
# 这一步是降维。
# 在问题1.2得到的是一个N x N的矩阵（N是单词集的大小），使用scikit-learn实现的SVD（奇异值分解），从这个大矩阵里分解出一个含k个特制的N x k 小矩阵。
def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of corpus words, number of corpus words)): co-occurence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensioal word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # Use this parameter in your call to `TruncatedSVD`
    M_reduced = None
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # Write your implementation here.
    svd = TruncatedSVD(n_components=k)
    svd.fit(M.T)
    M_reduced = svd.components_.T

    print("Done.")
    return M_reduced

# 问题1.4 实现 plot_embeddings
# 编写一个函数来绘制2D空间中的一组2D矢量。
# 基于matplotlib，用scatter 画 “×”，用 text 写字
def plot_embeddings(M_reduced, word2Ind, words):
    """ Plot in a scatterplot the embeddings of the words specified in the list "words".
        NOTE: do not plot all the words listed in M_reduced / word2Ind.
        Include a label next to each point.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensioal word embeddings
            word2Ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    # Write your implementation here.
    fig = plt.figure()
    plt.style.use("seaborn-whitegrid")
    for word in words:
        point = M_reduced[word2Ind[word]]
        plt.scatter(point[0], point[1], marker="^")
        plt.annotate(word, xy=(point[0], point[1]), xytext=(point[0], point[1] + 0.1))

# 测试解决方案图
print("-" * 80)
print("Outputted Plot:")

M_reduced_plot_test = np.array([[1, 1], [-1, -1], [1, -1], [-1, 1], [0, 0]])
word2Ind_plot_test = {'test1': 0, 'test2': 1, 'test3': 2, 'test4': 3, 'test5': 4}
words = ['test1', 'test2', 'test3', 'test4', 'test5']
plot_embeddings(M_reduced_plot_test, word2Ind_plot_test, words)

print("-" * 80)
print()

# 问题1.5：共现打印分析
# 将词嵌入到2个维度上，归一化，最终词向量会落到一个单位圆内，在坐标系上寻找相近的词。
reuters_corpus = read_corpus()
M_co_occurrence, word2Ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Rescale (normalize) the rows to make them each of unit-length
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths[:, np.newaxis] # broadcasting

words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']

plot_embeddings(M_normalized, word2Ind_co_occurrence, words)
plt.show()

# Part 2：基于预测的词向量
# 使用gensim探索词向量，不是自己实现word2vec，所使用的词向量维度是300，由google发布。
def load_word2vec(embeddings_fp="./GoogleNews-vectors-negative300.bin"):
    """ Load Word2Vec Vectors
        Param:
            embeddings_fp (string) - path to .bin file of pretrained word vectors
        Return:
            wv_from_bin: All 3 million embeddings, each lengh 300
                This is the KeyedVectors format: https://radimrehurek.com/gensim/models/deprecated/keyedvectors.html
    """
    embed_size = 300
    print("Loading 3 million word vectors from file...")
    ## 自己下载的文件
    wv_from_bin = KeyedVectors.load_word2vec_format(embeddings_fp, binary=True)
    vocab = list(wv_from_bin.vocab.keys())
    print("Loaded vocab size %i" % len(vocab))
    return wv_from_bin
wv_from_bin = load_word2vec()
print()

# 首先使用SVD降维，将300维降2维，方便打印查看。
# 问题2.1：word2vec打印分析
# 和问题1.5一样
def get_matrix_of_vectors(wv_from_bin, required_words=['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']):
    """ Put the word2vec vectors into a matrix M.
        Param:
            wv_from_bin: KeyedVectors object; the 3 million word2vec vectors loaded from file
        Return:
            M: numpy matrix shape (num words, 300) containing the vectors
            word2Ind: dictionary mapping each word to its row number in M
    """
    import random
    words = list(wv_from_bin.vocab.keys())
    print("Shuffling words ...")
    random.shuffle(words)
    words = words[:10000]       # 选10000个加入
    print("Putting %i words into word2Ind and matrix M..." % len(words))
    word2Ind = {}
    M = []
    curInd = 0
    for w in words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    for w in required_words:
        try:
            M.append(wv_from_bin.word_vec(w))
            word2Ind[w] = curInd
            curInd += 1
        except KeyError:
            continue
    M = np.stack(M)
    print("Done.")
    return M, word2Ind

# 测试解决方案图
print("-" * 80)
print("Outputted Plot:")
print("-" * 80)

M, word2Ind = get_matrix_of_vectors(wv_from_bin)
M_reduced = reduce_to_k_dim(M, k=2)         # 减到了2维
plt.tight_layout()
words = ['barrels', 'bpd', 'ecuador', 'energy', 'industry', 'kuwait', 'oil', 'output', 'petroleum', 'venezuela']
plot_embeddings(M_reduced, word2Ind, words)
plt.show()

# 问题2.2：一词多义
# 找到一个有多个含义的词（比如 “leaves”，“scoop”），这种词的top-10相似词（根据余弦相似度）里有两个词的意思不一样。比如"leaves"（叶子，花瓣）的top-10词里有"vanishes"（消失）和"stalks"（茎秆）。
# 这里我找到的词是"column"（列），它的top-10里有"columnist"（专栏作家）和"article"（文章）
w0 = "column"
w0_mean = wv_from_bin.most_similar(w0)
print("column：", w0_mean)
print()

# 问题2.3：近义词和反义词
# 找到三个词(w1, w2, w3)，其中w1和w2是近义词，w1和w3是反义词，但是w1和w3的距离<w1和w2的距离。
# 例如：w1=“happy”，w2=“cheerful”，w3=“sad”
w1 = "love"
w2 = "like"
w3 = "hate"
w1_w2_dist = wv_from_bin.distance(w1, w2)
w1_w3_dist = wv_from_bin.distance(w1, w3)
print("Synonyms {}, {} have cosine distance: {}".format(w1, w2, w1_w2_dist))
print("Antonyms {}, {} have cosine distance: {}".format(w1, w3, w1_w3_dist))
print()

# 问题2.4：类比
# man 对于 king，相当于woman对于___，这样的问题也可以用word2vec来解决
# man : him :: woman : her
print("类比 man : him :: woman : her：")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'him'], negative=['man']))
print()


# 问题2.5：错误的类比
# 找到一个错误的类比，树：树叶 ：：花：花瓣
print("错误的类比 tree : leaf :: flower : petal：")
pprint.pprint(wv_from_bin.most_similar(positive=['leaf', 'flower'], negative=['tree']))
print()

# 问题2.6：偏见分析
# 注意偏见是很重要的比如性别歧视、种族歧视等，执行下面代码，分析两个问题：
# (a) 哪个词与“woman”和“boss”最相似，和“man”最不相似?
# (b) 哪个词与“man”和“boss”最相似，和“woman”最不相似?
print("偏见 woman : boss :: man：")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'boss'], negative=['man']))
print()
print("偏见 man : boss :: woman：")
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'boss'], negative=['woman']))
print()

# 问题2.7：自行分析偏见
#     男人:女人 :: 医生:___
#     女人:男人 :: 医生:___
print("自行分析偏见 woman : doctor :: man：")
pprint.pprint(wv_from_bin.most_similar(positive=['woman', 'doctor'], negative=['man']))
print()
print("自行分析偏见 man : doctor :: woman：")
pprint.pprint(wv_from_bin.most_similar(positive=['man', 'doctor'], negative=['woman']))
print()
Alhfors
关注
0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
NLP-task5

基于预测的词向量目前，基于预测的词向量是最流行的，比如word2vec。现在我们来探索word2vec生成的词向量。这一部分主要是使用gensim探索词向量，不是自己实现word2vec，所使用的词向量维度是300，由google发布。# 导入包import sysassert sys.version_info[0]==3assert sys.version_info[1] >= 5from gensim.models import KeyedVectors # KeyedVec
复制链接

扫一扫