NLP数据预处理

最新推荐文章于 2024-07-20 19:16:01 发布

Lzj000lzj

最新推荐文章于 2024-07-20 19:16:01 发布

阅读量677

点赞数

分类专栏： nlp 文章标签： pre

nlp 专栏收录该内容

11 篇文章 0 订阅

订阅专栏

词的编码以及按 batch 切分（构建词表，将语料转换为 id 序列，并整理成 batch_size 行）

import torch
import os


class Dictionary(object):
    """Two-way vocabulary mapping between words and consecutive integer ids.

    Ids are handed out in insertion order starting from 0; ``idx`` always
    holds the id that the next previously unseen word will receive.
    """

    def __init__(self):
        self.word2idx = {}
        self.idx2word = {}
        self.idx = 0

    def add_word(self, word):
        """Register *word* under the next free id; known words are ignored."""
        # Membership is tested against the dict's keys.
        if word in self.word2idx:
            return
        new_id = self.idx
        # Both directions of the mapping are updated together.
        self.idx2word[new_id] = word
        self.word2idx[word] = new_id
        self.idx = new_id + 1

    def __len__(self):
        """Return the number of distinct words registered so far."""
        return len(self.word2idx)


class Corpus(object):
    """Text corpus that grows a vocabulary and encodes files as word ids."""

    def __init__(self):
        self.dictionary = Dictionary()

    def get_data(self, path, batch_size=20, encoding='utf-8'):
        """Encode the text file at *path* as a 2-D tensor of word ids.

        Each line is split on whitespace and terminated with an ``'<eos>'``
        token. Every token is added to ``self.dictionary`` and replaced by
        its id. The resulting id sequence is truncated to a multiple of
        ``batch_size`` and reshaped so that each row is one contiguous
        slice of the corpus.

        Args:
            path: Path of the text file to read.
            batch_size: Number of rows in the returned tensor.
            encoding: Text encoding used to decode the file. Passed
                explicitly so the result does not depend on the platform's
                locale default.

        Returns:
            A ``torch.long`` tensor of shape
            ``(batch_size, num_tokens // batch_size)``. When the file holds
            fewer than ``batch_size`` tokens the second dimension is 0.

        Raises:
            ValueError: If ``batch_size`` is not a positive integer.
        """
        if batch_size <= 0:
            raise ValueError(
                'batch_size must be a positive integer, got %r' % (batch_size,)
            )

        # Single pass: register each token and record its id immediately,
        # so the file is read only once.
        encoded = []
        with open(path, 'r', encoding=encoding) as f:
            for line in f:
                for word in line.split() + ['<eos>']:
                    self.dictionary.add_word(word)
                    encoded.append(self.dictionary.word2idx[word])

        ids = torch.tensor(encoded, dtype=torch.long)
        num_batches = ids.size(0) // batch_size  # columns per row
        ids = ids[:num_batches * batch_size]  # drop the ragged tail
        # Spell out both dimensions: view(batch_size, -1) cannot infer the
        # missing size when the tensor is empty.
        return ids.view(batch_size, num_batches)