斯坦福cs224n-2021 assignment1-探索词向量—词共现矩阵—SVD（奇异值分解）

最新推荐文章于 2024-04-07 16:51:50 发布

Anki Stark

最新推荐文章于 2024-04-07 16:51:50 发布

阅读量371

点赞数

文章标签： nlp

本文链接：https://blog.csdn.net/visualstudio2018/article/details/127723208

版权

词共现矩阵：
通过统计一个事先指定大小（window_size）的窗口内的 word 共现次数，以 word 周边共现词的出现次数作为当前 word 的 vector。

SVD（奇异值分解）
基于共现矩阵得到的离散词向量存在高维和稀疏的问题，可通过 SVD 对原始词向量进行降维，从而得到稠密的连续词向量。

参考链接：
https://blog.csdn.net/m0_37565948/article/details/84989565
https://blog.csdn.net/m0_37565948/article/details/84990043

sanity check 的代码就不贴了。

# All Import Statements Defined Here
# Note: Do not add to this list.
# ----------------

import sys
assert sys.version_info[0]==3
assert sys.version_info[1] >= 5

from gensim.models import KeyedVectors
from gensim.test.utils import datapath
import pprint
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [10, 5]
import nltk
nltk.download('reuters')
from nltk.corpus import reuters
import numpy as np
import random
import scipy as sp
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import PCA

# Sentinel tokens that read_corpus() places at the start and end of every document.
START_TOKEN = '<START>'
END_TOKEN = '<END>'

# Seed both NumPy's and Python's global random generators with a fixed value.
np.random.seed(0)
random.seed(0)
# ----------------

def read_corpus(category="crude"):
    """ Load every Reuters document belonging to one category.

        Each document is lower-cased token by token and wrapped in the
        START_TOKEN / END_TOKEN markers.

        Params:
            category (string): category name
        Return:
            list of lists, one list of tokens per file in the category
    """
    documents = []
    for file_id in reuters.fileids(category):
        tokens = [token.lower() for token in reuters.words(file_id)]
        documents.append([START_TOKEN, *tokens, END_TOKEN])
    return documents

# Load the default category and pretty-print its first three documents.
reuters_corpus = read_corpus()
pprint.pprint(reuters_corpus[:3], width=100, compact=True)

数据例子：
在这里插入图片描述
单词去重：

def distinct_words(corpus):
    """ Determine a list of distinct words for the corpus.

        Params:
            corpus (list of list of strings): corpus of documents
        Return:
            corpus_words (list of strings): sorted list of distinct words across the corpus
            num_corpus_words (integer): number of distinct words across the corpus
    """
    # The set comprehension de-duplicates across all documents; sorted() accepts
    # any iterable and already returns a list, so no intermediate list() is needed.
    corpus_words = sorted({word for doc in corpus for word in doc})
    return corpus_words, len(corpus_words)

计算词共现矩阵，默认窗口大小是4。

def compute_co_occurrence_matrix(corpus, window_size=4):
    """ Compute co-occurrence matrix for the given corpus and window_size (default of 4).

        Note: Each word in a document should be at the center of a window. Words near edges will have a smaller
              number of co-occurring words.

              For example, if we take the document "<START> All that glitters is not gold <END>" with window size of 4,
              "All" will co-occur with "<START>", "that", "glitters", "is", and "not".

        Params:
            corpus (list of list of strings): corpus of documents
            window_size (int): size of context window
        Return:
            M (a symmetric numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)):
                Co-occurrence matrix of word counts.
                The ordering of the words in the rows/columns should be the same as the ordering of the words given by the distinct_words function.
            word2ind (dict): dictionary that maps word to index (i.e. row/column number) for matrix M.
    """
    words, num_words = distinct_words(corpus)
    word2ind = {word: index for index, word in enumerate(words)}
    M = np.zeros((num_words, num_words))

    for doc in corpus:
        # Translate the document to matrix indices once instead of doing a
        # dictionary lookup for every (center, neighbor) pair.
        indices = [word2ind[word] for word in doc]
        doc_len = len(indices)
        for center_pos, row in enumerate(indices):
            # Clamp the window to the document so no out-of-range position is visited.
            first = max(0, center_pos - window_size)
            last = min(doc_len, center_pos + window_size + 1)
            for neighbor_pos in range(first, last):
                if neighbor_pos == center_pos:
                    # A word does not co-occur with its own occurrence.
                    continue
                M[row, indices[neighbor_pos]] += 1

    return M, word2ind

降维：
构造一个对矩阵进行降维的函数，用来产生 k 维词嵌入。

def reduce_to_k_dim(M, k=2):
    """ Reduce a co-occurrence count matrix of dimensionality (num_corpus_words, num_corpus_words)
        to a matrix of dimensionality (num_corpus_words, k) using the following SVD function from Scikit-Learn:
            - http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.TruncatedSVD.html

        Params:
            M (numpy matrix of shape (number of unique words in the corpus , number of unique words in the corpus)): co-occurrence matrix of word counts
            k (int): embedding size of each word after dimension reduction
        Return:
            M_reduced (numpy matrix of shape (number of corpus words, k)): matrix of k-dimensional word embeddings.
                    In terms of the SVD from math class, this actually returns U * S
    """
    n_iters = 10  # iteration count handed to TruncatedSVD via n_iter
    print("Running Truncated SVD over %i words..." % (M.shape[0]))

    # Fit the decomposition and project M onto its k components in one step.
    M_reduced = TruncatedSVD(n_components=k, n_iter=n_iters).fit_transform(M)

    print("Done.")
    return M_reduced

绘制二维空间中的一组二维向量：

def plot_embeddings(M_reduced, word2ind, words):
    """ Draw a labelled scatterplot of the embeddings of the words given in "words".
        NOTE: only the requested words are plotted, not everything in M_reduced / word2ind.

        Params:
            M_reduced (numpy matrix of shape (number of unique words in the corpus , 2)): matrix of 2-dimensional word embeddings
            word2ind (dict): dictionary that maps word to indices for matrix M
            words (list of strings): words whose embeddings we want to visualize
    """
    for label in words:
        point = M_reduced[word2ind[label]]
        x_coord, y_coord = point[0], point[1]
        # One scatter call per word, then its text label at the same position.
        plt.scatter(x_coord, y_coord)
        plt.annotate(label, (x_coord, y_coord))

主程序（运行以下代码生成图像）：

# -----------------------------
# Run This Cell to Produce Your Plot
# ------------------------------
reuters_corpus = read_corpus()
M_co_occurrence, word2ind_co_occurrence = compute_co_occurrence_matrix(reuters_corpus)
M_reduced_co_occurrence = reduce_to_k_dim(M_co_occurrence, k=2)

# Divide every row by its Euclidean norm so each embedding has unit length;
# the norms are shaped into a column so the division broadcasts across each row.
M_lengths = np.linalg.norm(M_reduced_co_occurrence, axis=1)
M_normalized = M_reduced_co_occurrence / M_lengths.reshape(-1, 1)

words = [
    'barrels', 'bpd', 'ecuador', 'energy', 'industry',
    'kuwait', 'oil', 'output', 'petroleum', 'iraq',
]

plot_embeddings(M_normalized, word2ind_co_occurrence, words)