import io
import os

from gluonnlp.data.dataset import Dataset


def line_splitter(s):
    """Split a string at newlines.

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    List[str]
        List of strings. Obtained by calling s.splitlines().
    """
    return s.splitlines()


def whitespace_splitter(s):
    """Split a string at whitespace (space, tab, newline, return, formfeed).

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    List[str]
        List of strings. Obtained by calling s.split().
    """
    return s.split()


def wordtoword_splitter(s):
    """Split a string into individual characters (character-level tokenization)."""
    return list(s)


def _corpus_dataset_process(s, bos, eos):
    """Optionally prepend a `bos` token and append an `eos` token to a sample."""
    tokens = [bos] if bos else []
    tokens.extend(s)
    if eos:
        tokens.append(eos)
    return tokens


def concat_sequence(sequences):
    """Concatenate sequences of tokens into a single flattened list of tokens,
    i.e. join multiple sentences into one.

    Parameters
    ----------
    sequences : list of list of object
        Sequences of tokens, each of which is an iterable of tokens.

    Returns
    -------
    Flattened list of tokens. Falsy tokens (e.g. empty strings) are dropped.
    """
    return [token for seq in sequences for token in seq if token]


class SimpleDataset(Dataset):
    """Simple Dataset wrapper for lists and arrays.

    Parameters
    ----------
    data : dataset-like object
        Any object that implements `len()` and `[]`.
    """

    def __init__(self, data):
        self._data = data

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        return self._data[idx]


class CorpusDataset(SimpleDataset):
    """Common text dataset that reads a whole corpus based on the provided
    sample splitter and word tokenizer.

    The returned dataset includes samples, each of which can either be a
    list of tokens if a tokenizer is specified, or otherwise a single string
    segment produced by the sample_splitter.

    Parameters
    ----------
    filename : str or list of str
        Path to the input text file or list of paths to the input text files.
    encoding : str, default 'utf8'
        File encoding format.
    flatten : bool, default False
        Whether to return all samples as flattened tokens. If True, each
        sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters.
        If False, `bos` and `eos` will be added in empty samples.
    sample_splitter : function, default str.splitlines
        A function that splits the dataset string into samples.
    tokenizer : function or None, default str.split
        A function that splits each sample string into a list of tokens.
        If None, raw samples are returned according to `sample_splitter`.
    bos : str or None, default None
        The token to add at the beginning of each sequence. If None, or if
        tokenizer is not specified, then nothing is added.
    eos : str or None, default None
        The token to add at the end of each sequence. If None, or if
        tokenizer is not specified, then nothing is added.
    """

    def __init__(self, filename, encoding='utf8', flatten=False,
                 skip_empty=True, sample_splitter=line_splitter,
                 tokenizer=whitespace_splitter, bos=None, eos=None):
        assert sample_splitter, 'sample_splitter must be specified.'
        if not isinstance(filename, (tuple, list)):
            filename = (filename,)
        self._filenames = [os.path.expanduser(f) for f in filename]
        self._encoding = encoding
        self._flatten = flatten
        self._skip_empty = skip_empty
        self._sample_splitter = sample_splitter
        self._tokenizer = tokenizer
        self._bos = bos
        self._eos = eos
        super(CorpusDataset, self).__init__(self._read())

    def _read(self):
        all_samples = []
        for filename in self._filenames:
            with io.open(filename, 'r', encoding=self._encoding) as fin:
                content = fin.read()
            samples = (s.strip() for s in self._sample_splitter(content))
            if self._tokenizer:
                samples = [
                    _corpus_dataset_process(self._tokenizer(s), self._bos, self._eos)
                    for s in samples if s or not self._skip_empty
                ]
                if self._flatten:
                    samples = concat_sequence(samples)
            elif self._skip_empty:
                samples = [s for s in samples if s]
            all_samples += samples
        return all_samples


if __name__ == '__main__':
    sd = SimpleDataset([1, 2, 3])
    print(sd[0])
    print(list(sd))

    d = []
    sd = SimpleDataset([[1, 2, 3], [1, 2, 3]])
    print('sd[0]: ', sd[0])
    d.extend(sd)  # extend() iterates the dataset and copies the samples out
    print('d: ', d)

    cd = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter)
    print(cd._data)  # peek at the private sample list directly
    cd1 = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter,
                        flatten=True, eos='<end>')
    print(cd1._data)
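The demo reads a testcorpus.txt that is not shipped with the script. Below is a minimal sketch to recreate it, assuming one sentence per line; the three sentences are reconstructed by joining the character tokens in the printed output further down, and the `sentences` helper variable is hypothetical, not part of the original script.

import io

# Hypothetical setup code: writes the sample corpus that the __main__ block
# above expects. Each sentence becomes one line, matching the default
# line_splitter sample_splitter.
sentences = [
    '新浪娱乐讯 北京时间9月4日消息,据《名利场》报道称,罗伯特·帕丁森确诊新冠阳性,他主演的新《蝙蝠侠》电影拍摄也暂停。',
    '不久前,《每日邮报》曝出该片有一名剧组人员感染新冠,刚在英国复工几天的影片拍摄也因此暂停(但报道用的是crew,而非cast,即是指幕后工作人员而非演员)。两小时后,华纳确认有一名《蝙蝠侠》制作团队成员感染了新冠,并简短确认了拍摄暂停一事,按惯例这份声明没有透露感染者身份,只表示其按规定在隔离中。',
    '而又是两小时后,《名利场》称另有高层消息源称是帕丁森新冠检测阳性。他的代理人尚未就此报道做出回复。',
]
with io.open('testcorpus.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(sentences))

With the file in place, running the script prints: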
1
[1, 2, 3]
sd[0]: [1, 2, 3]
d: [[1, 2, 3], [1, 2, 3]]
[['新', '浪', '娱', '乐', '讯', ' ', '北', '京', '时', '间', '9', '月', '4', '日', '消', '息', ',', '据', '《', '名', '利', '场', '》', '报', '道', '称', ',', '罗', '伯', '特', '·', '帕', '丁', '森', '确', '诊', '新', '冠', '阳', '性', ',', '他', '主', '演', '的', '新', '《', '蝙', '蝠', '侠', '》', '电', '影', '拍', '摄', '也', '暂', '停', '。'], ['不', '久', '前', ',', '《', '每', '日', '邮', '报', '》', '曝', '出', '该', '片', '有', '一', '名', '剧', '组', '人', '员', '感', '染', '新', '冠', ',', '刚', '在', '英', '国', '复', '工', '几', '天', '的', '影', '片', '拍', '摄', '也', '因', '此', '暂', '停', '(', '但', '报', '道', '用', '的', '是', 'c', 'r', 'e', 'w', ',', '而', '非', 'c', 'a', 's', 't', ',', '即', '是', '指', '幕', '后', '工', '作', '人', '员', '而', '非', '演', '员', ')', '。', '两', '小', '时', '后', ',', '华', '纳', '确', '认', '有', '一', '名', '《', '蝙', '蝠', '侠', '》', '制', '作', '团', '队', '成', '员', '感', '染', '了', '新', '冠', ',', '并', '简', '短', '确', '认', '了', '拍', '摄', '暂', '停', '一', '事', ',', '按', '惯', '例', '这', '份', '声', '明', '没', '有', '透', '露', '感', '染', '者', '身', '份', ',', '只', '表', '示', '其', '按', '规', '定', '在', '隔', '离', '中', '。'], ['而', '又', '是', '两', '小', '时', '后', ',', '《', '名', '利', '场', '》', '称', '另', '有', '高', '层', '消', '息', '源', '称', '是', '帕', '丁', '森', '新', '冠', '检', '测', '阳', '性', '。', '他', '的', '代', '理', '人', '尚', '未', '就', '此', '报', '道', '做', '出', '回', '复', '。']]
['新', '浪', '娱', '乐', '讯', ' ', '北', '京', '时', '间', '9', '月', '4', '日', '消', '息', ',', '据', '《', '名', '利', '场', '》', '报', '道', '称', ',', '罗', '伯', '特', '·', '帕', '丁', '森', '确', '诊', '新', '冠', '阳', '性', ',', '他', '主', '演', '的', '新', '《', '蝙', '蝠', '侠', '》', '电', '影', '拍', '摄', '也', '暂', '停', '。', '<end>', '不', '久', '前', ',', '《', '每', '日', '邮', '报', '》', '曝', '出', '该', '片', '有', '一', '名', '剧', '组', '人', '员', '感', '染', '新', '冠', ',', '刚', '在', '英', '国', '复', '工', '几', '天', '的', '影', '片', '拍', '摄', '也', '因', '此', '暂', '停', '(', '但', '报', '道', '用', '的', '是', 'c', 'r', 'e', 'w', ',', '而', '非', 'c', 'a', 's', 't', ',', '即', '是', '指', '幕', '后', '工', '作', '人', '员', '而', '非', '演', '员', ')', '。', '两', '小', '时', '后', ',', '华', '纳', '确', '认', '有', '一', '名', '《', '蝙', '蝠', '侠', '》', '制', '作', '团', '队', '成', '员', '感', '染', '了', '新', '冠', ',', '并', '简', '短', '确', '认', '了', '拍', '摄', '暂', '停', '一', '事', ',', '按', '惯', '例', '这', '份', '声', '明', '没', '有', '透', '露', '感', '染', '者', '身', '份', ',', '只', '表', '示', '其', '按', '规', '定', '在', '隔', '离', '中', '。', '<end>', '而', '又', '是', '两', '小', '时', '后', ',', '《', '名', '利', '场', '》', '称', '另', '有', '高', '层', '消', '息', '源', '称', '是', '帕', '丁', '森', '新', '冠', '检', '测', '阳', '性', '。', '他', '的', '代', '理', '人', '尚', '未', '就', '此', '报', '道', '做', '出', '回', '复', '。', '<end>']
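For completeness, a hedged sketch (not in the original script; the names cd2 and cd3 are hypothetical) of two more parameter combinations the docstring describes: tokenizer=None yields the raw stripped lines, and bos/eos wrap each non-flattened tokenized sample.

# With tokenizer=None, each sample is the raw line string from sample_splitter.
cd2 = CorpusDataset('testcorpus.txt', tokenizer=None)
print(cd2[0])  # first line of the corpus, as a single string

# With bos/eos and flatten=False, every tokenized sample is wrapped in markers.
cd3 = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter,
                    bos='<start>', eos='<end>')
print(cd3[0][0], cd3[0][-1])  # prints: <start> <end>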