NLP Word Vectors

Tokenization

Word2vec

Word2vec comes in two flavors: skip-gram and CBOW (continuous bag of words). Skip-gram predicts the surrounding window words from the center word, while CBOW predicts the center word from the surrounding window.
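To make the difference concrete, here is a toy illustration (not part of the training code) of the training pairs each variant produces with a window of 1:

sentence = ["the", "quick", "brown", "fox", "jumps"]
window = 1

for i, center in enumerate(sentence):
    context = [sentence[j] for j in range(max(0, i - window), min(len(sentence), i + window + 1)) if j != i]
    # skip-gram: one (center -> context word) training pair per surrounding word
    skipgram_pairs = [(center, ctx) for ctx in context]
    # CBOW: a single (context words -> center) training pair
    cbow_pair = (context, center)
    print(skipgram_pairs, cbow_pair)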

Data preprocessing

import numpy as np
from collections import Counter

K = 10                    # number of negative samples per positive word
C = 3                     # context window size (C words on each side)
NUM_EPOCHS = 2
VOCAB_SIZE = 30000
BATCH_SIZE = 128
LEARNING_RATE = 0.2
EMBEDDING_SIZE = 100      # dimensionality of the word vectors
LOG_FILE = "word_embedding.log"

with open("text8.train.txt", "r") as file:
    text = file.read()

text = text.lower().split()
vocab = dict(Counter(text).most_common(VOCAB_SIZE - 1))

# every remaining word is mapped to <unk>
vocab["<unk>"] = len(text) - np.sum(list(vocab.values()))
print(np.sum(list(vocab.values())), vocab["<unk>"])
idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}

# unigram distribution raised to the 3/4 power, used for negative sampling
word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)
word_freqs = word_freqs / np.sum(word_freqs)

Here K is the number of negative samples drawn per positive word, C is the positive-sample window (C words on each side of the center word), and EMBEDDING_SIZE is the dimensionality of the word vectors. The code above builds the data for English text: English only needs to be split on whitespace. Counter then counts the frequency of each word; idx_to_word and word_to_idx map between each word and its integer id; word_freqs is the distribution used for negative sampling together with torch.multinomial, so words with higher frequency are more likely to be drawn.
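The negative-sampling draw itself is a single call; a minimal sketch (the same call appears inside the Dataset below):

import torch

weights = torch.tensor(word_freqs)                 # one sampling weight per vocabulary entry
# draw K negatives for each of the 2*C positive words; replacement=True allows repeats
neg_ids = torch.multinomial(weights, K * 2 * C, replacement=True)
print(neg_ids.shape)  # torch.Size([60]) with K = 10, C = 3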

import os
import thulac
import numpy as np
from collections import Counter

print(os.getcwd())
with open("1.txt", "r", encoding="utf-8", errors="ignore") as f:
    lines = f.readlines()

# drop empty lines
data = []
for line in lines:
    line = line.strip()
    if line != "":
        data.append(line)

# word segmentation with THULAC (file in, file out)
with open("p_input.txt", "w", encoding="utf-8") as f:
    f.write("\n".join(data))

thu1 = thulac.thulac(seg_only=True)
thu1.cut_f("p_input.txt", "p_output.txt")

# punctuation to remove
stopwords = '''~!@#$%^&*()_+`1234567890-={}[]::";'<>,.?/|\、·!()¥“”‘’《》,。?/—-【】….'''
stopwords_set = set([i for i in stopwords])
stopwords_set.add("br")  # also add this stray token so it gets removed

with open("p_output.txt", "r", encoding="utf-8") as f:
    lines = f.readlines()

# strip punctuation and collapse extra spaces
data = []
for line in lines:
    for s in stopwords_set:
        line = line.strip().replace(s, "")
    line = line.replace("   ", " ").replace("  ", " ")
    if line != "" and line != " ":
        data.append(line)

# save the cleaned, segmented corpus
with open("all.txt", "w", encoding="utf-8") as f:
    f.write(" ".join(data))
all_text = ""
with open("all.txt", "r", encoding="utf-8") as f:
    all_text = f.readline()

# 90% / 5% / 5% train / dev / test split
all_len = len(all_text)
train_text = all_text[:int(all_len * 0.9)]
dev_text = all_text[int(all_len * 0.9):int(all_len * 0.95)]
test_text = all_text[int(all_len * 0.95):]

with open("dev.txt", "w", encoding="utf-8") as f:
    f.write(dev_text)
with open("test.txt", "w", encoding="utf-8") as f:
    f.write(test_text)
with open("train.txt", "w", encoding="utf-8") as f:
    f.write(train_text)

with open("train.txt", "r", encoding="utf-8") as f:
    text = f.read()

text = text.lower().split()  # split into a list of tokens
vocab = dict(Counter(text).most_common(VOCAB_SIZE - 1))  # word -> count for the most frequent words
vocab['<UNK>'] = len(text) - np.sum(list(vocab.values()))

idx_to_word = [word for word in vocab.keys()]
word_to_idx = {word: i for i, word in enumerate(idx_to_word)}

word_counts = np.array([count for count in vocab.values()], dtype=np.float32)
word_freqs = word_counts / np.sum(word_counts)
word_freqs = word_freqs ** (3. / 4.)
word_freqs = word_freqs / np.sum(word_freqs)

Obtaining word vectors for Chinese takes a few more steps. First, be clear about the file's encoding (GBK, UTF-8, etc.). Next, a segmentation tool is needed to split sentences into words; THULAC is used here. Then common Chinese punctuation has to be stripped out. The remaining processing is identical to the English case.
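For reference, THULAC can also segment a string in memory instead of going through files (a minimal sketch; the sentence is just an example):

import thulac

thu1 = thulac.thulac(seg_only=True)                   # segmentation only, no POS tagging
segmented = thu1.cut("我爱自然语言处理", text=True)     # text=True returns a space-separated string
print(segmented)  # e.g. "我 爱 自然 语言 处理"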

skip-gram

import torch
import torch.utils.data as tud


class Dataset(tud.Dataset):  # subclass of torch.utils.data.Dataset

    def __init__(self, text, word_to_idx, idx_to_word, word_freqs, word_counts):
        super(Dataset, self).__init__()
        # encode every token as an id; unknown words map to the last index (<unk>)
        self.text_encoded = [word_to_idx.get(t, VOCAB_SIZE - 1) for t in text]
        self.text_encoded = torch.Tensor(self.text_encoded).long()
        self.word_to_idx = word_to_idx
        self.idx_to_word = idx_to_word
        self.word_freqs = torch.Tensor(word_freqs)
        self.word_counts = torch.Tensor(word_counts)

    def __len__(self):
        return len(self.text_encoded)

    def __getitem__(self, idx):
        ''' Returns the data used for one training example:
            - the center word
            - the (positive) words around it
            - K randomly sampled words per positive word as negative samples
        '''
        center_word = self.text_encoded[idx]
        pos_indices = list(range(idx - C, idx)) + list(range(idx + 1, idx + C + 1))
        pos_indices = [i % len(self.text_encoded) for i in pos_indices]  # wrap around at both ends
        pos_words = self.text_encoded[pos_indices]
        # replacement=True: sample with replacement
        neg_words = torch.multinomial(self.word_freqs, K * pos_words.shape[0], replacement=True)

        return center_word, pos_words, neg_words

This follows the same Dataset pattern used in CV. The label (the center word) is an integer tensor, the positive samples are the ids of the words inside the context window, and the negative samples are word ids drawn according to word frequency.
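A minimal usage sketch, assuming the preprocessing variables above are in scope:

dataset = Dataset(text, word_to_idx, idx_to_word, word_freqs, word_counts)
dataloader = tud.DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

center, pos, neg = next(iter(dataloader))
print(center.shape, pos.shape, neg.shape)  # e.g. [128], [128, 6], [128, 60]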

import torch
import torch.nn as nn
import torch.nn.functional as F


class EmbeddingModel(nn.Module):

    def __init__(self, vocab_size, embed_size):
        super(EmbeddingModel, self).__init__()
        self.vocab_size = vocab_size  # 30000
        self.embed_size = embed_size  # 100
        # input and output embeddings are two separate matrices of the same shape: nn.Embedding(30000, 100)
        self.in_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        self.out_embed = nn.Embedding(self.vocab_size, self.embed_size, sparse=False)
        # weight initialization
        initrange = 0.5 / self.embed_size
        self.in_embed.weight.data.uniform_(-initrange, initrange)
        self.out_embed.weight.data.uniform_(-initrange, initrange)

    def forward(self, input_labels, pos_labels, neg_labels):
        '''
        input_labels: center words,                   [batch_size]
        pos_labels:   words around each center word,  [batch_size, (C * 2)]
        neg_labels:   negative samples,               [batch_size, (C * 2 * K)]
        return: loss,                                 [batch_size]
        '''
        batch_size = input_labels.size(0)
        input_embedding = self.in_embed(input_labels)  # B * embed_size
        pos_embedding = self.out_embed(pos_labels)     # B * (2*C) * embed_size
        neg_embedding = self.out_embed(neg_labels)     # B * (2*C*K) * embed_size

        # torch.bmm() is a batched matrix multiply: (b, n, m) x (b, m, p) = (b, n, p)
        log_pos = torch.bmm(pos_embedding, input_embedding.unsqueeze(2)).squeeze(2)   # B * (2*C)
        log_neg = torch.bmm(neg_embedding, -input_embedding.unsqueeze(2)).squeeze(2)  # B * (2*C*K)

        # the loss below is the formula from the paper
        log_pos = F.logsigmoid(log_pos).sum(1)  # batch_size
        log_neg = F.logsigmoid(log_neg).sum(1)  # batch_size
        loss = log_pos + log_neg  # maximize the sum of positive and negative log-likelihoods
        return -loss

        # training learns two matrices, self.in_embed and self.out_embed;
        # the authors use the input matrix as the final word vectors

    def input_embeddings(self):
        return self.in_embed.weight.data.cpu().numpy()
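The post stops at the model definition; a minimal training-loop sketch showing how the Dataset and the model fit together might look like this (plain SGD with the constants defined above; not the author's exact script):

model = EmbeddingModel(VOCAB_SIZE, EMBEDDING_SIZE)
optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    for i, (input_labels, pos_labels, neg_labels) in enumerate(dataloader):
        optimizer.zero_grad()
        loss = model(input_labels, pos_labels, neg_labels).mean()  # average the per-sample losses
        loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("epoch {}, iter {}, loss {:.4f}".format(epoch, i, loss.item()))

embedding_weights = model.input_embeddings()  # final word vectors, one row per vocabulary entry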
        

The logsigmoid function is logsigmoid(x) = log σ(x) = log(1 / (1 + e^(-x))). It is always negative and approaches 0 as x grows, so maximizing it pushes its argument towards large positive values.

[figure: plot of the logsigmoid curve]
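For reference, the objective that forward() implements is the standard skip-gram negative-sampling objective, where $v_c$ is the input embedding of the center word, $u_o$ the output embeddings of its context words, and $u_k$ those of the $K$ sampled negatives per positive pair:

$$
\mathcal{L} = -\sum_{o \in \text{context}(c)} \Big( \log \sigma(u_o^{\top} v_c) + \sum_{k=1}^{K} \log \sigma(-u_k^{\top} v_c) \Big)
$$

forward() returns exactly this quantity per center word; the leading minus sign turns the maximization into a loss to minimize.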

CBOW

For CBOW only the dataset has to change: the roles of the label, positive samples, and negative samples are swapped, so the label becomes the center word and the input becomes the words in its window. In the model, take care to keep the dimensions aligned in torch.bmm and logsigmoid. A sketch of the change is shown below.
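A minimal sketch of the CBOW variant under these assumptions (CBOWModel is a hypothetical name; the dataset's __getitem__ now returns (context words, center word, negatives), and the context embeddings are summed before being scored against the center word and the negatives):

import torch
import torch.nn as nn
import torch.nn.functional as F


class CBOWModel(nn.Module):

    def __init__(self, vocab_size, embed_size):
        super(CBOWModel, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)   # context words
        self.out_embed = nn.Embedding(vocab_size, embed_size)  # center word and negatives

    def forward(self, context_labels, center_labels, neg_labels):
        '''
        context_labels: [batch_size, 2*C]   center_labels: [batch_size]   neg_labels: [batch_size, K]
        '''
        context_embedding = self.in_embed(context_labels).sum(dim=1)            # B * embed_size
        center_embedding = self.out_embed(center_labels)                        # B * embed_size
        neg_embedding = self.out_embed(neg_labels)                              # B * K * embed_size

        log_pos = (center_embedding * context_embedding).sum(dim=1)                       # B
        log_neg = torch.bmm(neg_embedding, -context_embedding.unsqueeze(2)).squeeze(2)    # B * K

        loss = F.logsigmoid(log_pos) + F.logsigmoid(log_neg).sum(1)
        return -loss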

glove

The theory behind GloVe is covered elsewhere; this section focuses on the implementation.

import torch
import torch.nn as nn
import torch.nn.init as init
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from collections import Counter, defaultdict


class GloVeModel(nn.Module):
    """Implement the GloVe model with PyTorch
    """

    def __init__(self, embedding_size, context_size, vocab_size, min_occurrance=1, x_max=100, alpha=3 / 4):
        super(GloVeModel, self).__init__()

        self.embedding_size = embedding_size
        if isinstance(context_size, tuple):
            self.left_context, self.right_context = context_size
        elif isinstance(context_size, int):
            self.left_context = self.right_context = context_size
        else:
            raise ValueError(
                "'context_size' should be an int or a tuple of two ints")
        self.vocab_size = vocab_size
        self.alpha = alpha
        self.min_occurrance = min_occurrance
        self.x_max = x_max

        self._focal_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._context_embeddings = nn.Embedding(
            vocab_size, embedding_size).type(torch.float64)
        self._focal_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        self._context_biases = nn.Embedding(vocab_size, 1).type(torch.float64)
        self._glove_dataset = None

        for params in self.parameters():
            init.uniform_(params, a=-1, b=1)

    def fit(self, corpus):
        """get dictionary word list and co-occruence matrix from corpus

        Args:
            corpus (list): contain word id list

        Raises:
            ValueError: when count zero cocurrences will raise the problems
        """

        left_size, right_size = self.left_context, self.right_context
        vocab_size, min_occurrance = self.vocab_size, self.min_occurrance

        # get the co-occurrence count matrix
        word_counts = Counter()
        cooccurence_counts = defaultdict(float)
        for region in corpus:
            word_counts.update(region)
            for left_context, word, right_context in _context_windows(region, left_size, right_size):
                for i, context_word in enumerate(left_context[::-1]):
                    # add (1 / distance from focal word) for this pair
                    cooccurence_counts[(word, context_word)] += 1 / (i + 1)
                for i, context_word in enumerate(right_context):
                    cooccurence_counts[(word, context_word)] += 1 / (i + 1)
        if len(cooccurence_counts) == 0:
            raise ValueError(
                "No cooccurrences in corpus. Did you try to reuse a generator?")

        # get words bag information
        tokens = [word for word, count in
                  word_counts.most_common(vocab_size) if count >= min_occurrance]
        coocurrence_matrix = [(words[0], words[1], count)
                              for words, count in cooccurence_counts.items()
                              if words[0] in tokens and words[1] in tokens]
        self._glove_dataset = GloVeDataSet(coocurrence_matrix)

    def train(self, num_epoch, device, batch_size=512, learning_rate=0.05, loop_interval=10):
        """Train the GloVe model.

        Args:
            num_epoch (int): number of epochs
            device (str): cpu or gpu device
            batch_size (int, optional): Defaults to 512.
            learning_rate (float, optional): Defaults to 0.05. learning rate for the Adam optimizer
            loop_interval (int, optional): Defaults to 10. number of steps between average-loss reports

        Raises:
            NotFitToCorpusError: raised if the model has not been fit to a corpus before training
        """

        if self._glove_dataset is None:
            raise NotFitToCorpusError(
                "Please fit model with corpus before training")

        # basic training setting
        optimizer = optim.Adam(self.parameters(), lr=learning_rate)
        glove_dataloader = DataLoader(self._glove_dataset, batch_size)
        total_loss = 0

        for epoch in range(num_epoch):
            for idx, batch in enumerate(glove_dataloader):
                optimizer.zero_grad()

                i_s, j_s, counts = batch
                i_s = i_s.to(device)
                j_s = j_s.to(device)
                counts = counts.to(device)
                loss = self._loss(i_s, j_s, counts)

                total_loss += loss.item()
                if idx % loop_interval == 0:
                    avg_loss = total_loss / loop_interval
                    print("epoch: {}, current step: {}, average loss: {}".format(
                        epoch, idx, avg_loss))
                    total_loss = 0

                loss.backward()
                optimizer.step()

        print("finish glove vector training")

    def get_coocurrance_matrix(self):
        """Return the co-occurrence matrix for saving.

        Returns:
            list: list of items (word_idx1, word_idx2, cooccurrences)
        """

        return self._glove_dataset._coocurrence_matrix

    def embedding_for_tensor(self, tokens):
        if not torch.is_tensor(tokens):
            raise ValueError("the tokens must be pytorch tensor object")

        return self._focal_embeddings(tokens) + self._context_embeddings(tokens)

    def _loss(self, focal_input, context_input, coocurrence_count):
        x_max, alpha = self.x_max, self.alpha

        focal_embed = self._focal_embeddings(focal_input)
        context_embed = self._context_embeddings(context_input)
        # bias lookups come out of nn.Embedding with shape [batch, 1]; squeeze to [batch]
        focal_bias = self._focal_biases(focal_input).squeeze(1)
        context_bias = self._context_biases(context_input).squeeze(1)

        # weighting function f(X_ij) = min((X_ij / x_max) ** alpha, 1)
        weight_factor = torch.pow(coocurrence_count / x_max, alpha)
        weight_factor[weight_factor > 1] = 1

        embedding_products = torch.sum(focal_embed * context_embed, dim=1)
        log_cooccurrences = torch.log(coocurrence_count)

        # (w_i . w_j + b_i + b_j - log X_ij) ** 2
        distance_expr = (embedding_products + focal_bias +
                         context_bias - log_cooccurrences) ** 2

        single_losses = weight_factor * distance_expr
        mean_loss = torch.mean(single_losses)
        return mean_loss


class GloVeDataSet(Dataset):

    def __init__(self, coocurrence_matrix):
        self._coocurrence_matrix = coocurrence_matrix

    def __getitem__(self, index):
        return self._coocurrence_matrix[index]

    def __len__(self):
        return len(self._coocurrence_matrix)


class NotTrainedError(Exception):
    pass


class NotFitToCorpusError(Exception):
    pass


def _context_windows(region, left_size, right_size):
    """Generate (left_context, word, right_context) tuples for each position in the region.

    Args:
        region (list): a sentence as a list of word ids
        left_size (int): left window size
        right_size (int): right window size
    """

    for i, word in enumerate(region):
        start_index = i - left_size
        end_index = i + right_size
        left_context = _window(region, start_index, i - 1)
        right_context = _window(region, i + 1, end_index)
        yield (left_context, word, right_context)


def _window(region, start_index, end_index):
    """Return the list of words from `start_index` to `end_index` (inclusive) taken
    from region. Indices falling outside the region are clamped to its boundaries,
    so windows near the edges are simply shorter.

    Args:
        region (list): the sentence (list of word ids) to extract tokens from
        start_index (int): index of the first word of the window
        end_index (int): index of the last word of the window
    """
    last_index = len(region) + 1
    selected_tokens = region[max(start_index, 0):
                             min(end_index, last_index) + 1]
    return selected_tokens

The co-occurrence matrix is built by fit, which calls _context_windows to obtain the left and right windows around each center word, while cooccurence_counts accumulates a weight of 1/distance for every (word, context word) pair.
The _loss function implements the GloVe loss.
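For reference, the weighted least-squares objective that _loss computes is the GloVe loss, with co-occurrence count $X_{ij}$, focal/context embeddings $w_i, \tilde{w}_j$ and biases $b_i, \tilde{b}_j$:

$$
J = \sum_{i,j} f(X_{ij}) \left( w_i^{\top} \tilde{w}_j + b_i + \tilde{b}_j - \log X_{ij} \right)^2,
\qquad
f(x) = \min\!\big( (x / x_{\max})^{\alpha},\ 1 \big)
$$

A hypothetical end-to-end usage sketch, assuming word_to_idx and VOCAB_SIZE from the preprocessing above are in scope:

# corpus is a list of regions, each a list of word ids; here the whole training file is one region
tokens = open("train.txt", encoding="utf-8").read().split()
corpus = [[word_to_idx.get(w, VOCAB_SIZE - 1) for w in tokens]]

model = GloVeModel(embedding_size=100, context_size=3, vocab_size=VOCAB_SIZE)
model.fit(corpus)                         # build the co-occurrence matrix
model.train(num_epoch=2, device="cpu")    # weighted least-squares optimization with Adam
vectors = model.embedding_for_tensor(torch.arange(VOCAB_SIZE))  # focal + context embeddings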
