import io
import os

from gluonnlp.data.dataset import Dataset


def line_splitter(s):
    """Split a string at newlines.

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    List[str]
        List of strings. Obtained by calling s.splitlines().
    """
    return s.splitlines()


def whitespace_splitter(s):
    """Split a string at whitespace (space, tab, newline, return, formfeed).

    Parameters
    ----------
    s : str
        The string to be split.

    Returns
    -------
    List[str]
        List of strings. Obtained by calling s.split().
    """
    return s.split()


def wordtoword_splitter(s):
    """Split a string into individual characters (character-level tokenization)."""
    return list(s)


def _corpus_dataset_process(s, bos, eos):
    """Optionally prepend a `bos` token and append an `eos` token to a sample."""
    tokens = [bos] if bos else []
    tokens.extend(s)
    if eos:
        tokens.append(eos)
    return tokens


def concat_sequence(sequences):
    """Concatenate sequences of tokens into a single flattened list of tokens,
    i.e. join multiple sentences into one.

    Parameters
    ----------
    sequences : list of list of object
        Sequences of tokens, each of which is an iterable of tokens.

    Returns
    -------
    Flattened list of tokens. Falsy tokens (e.g. empty strings) are dropped.
    """
    return [token for seq in sequences for token in seq if token]


class SimpleDataset(Dataset):
    """Simple Dataset wrapper for lists and arrays.

    Parameters
    ----------
    data : dataset-like object
        Any object that implements `len()` and `[]`.
    """

    def __init__(self, data):
        self._data = data

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        return self._data[idx]


class CorpusDataset(SimpleDataset):
    """Common text dataset that reads a whole corpus based on the provided
    sample splitter and word tokenizer.

    The returned dataset includes samples, each of which can either be a
    list of tokens if a tokenizer is specified, or otherwise a single string
    segment produced by the sample_splitter.

    Parameters
    ----------
    filename : str or list of str
        Path to the input text file or list of paths to the input text files.
    encoding : str, default 'utf8'
        File encoding format.
    flatten : bool, default False
        Whether to return all samples as flattened tokens. If True, each
        sample is a token.
    skip_empty : bool, default True
        Whether to skip the empty samples produced from sample_splitters.
        If False, `bos` and `eos` will be added in empty samples.
    sample_splitter : function, default str.splitlines
        A function that splits the dataset string into samples.
    tokenizer : function or None, default str.split
        A function that splits each sample string into a list of tokens.
        If None, raw samples are returned according to `sample_splitter`.
    bos : str or None, default None
        The token to add at the beginning of each sequence. If None, or if
        tokenizer is not specified, then nothing is added.
    eos : str or None, default None
        The token to add at the end of each sequence. If None, or if
        tokenizer is not specified, then nothing is added.
    """

    def __init__(self, filename, encoding='utf8', flatten=False,
                 skip_empty=True, sample_splitter=line_splitter,
                 tokenizer=whitespace_splitter, bos=None, eos=None):
        assert sample_splitter, 'sample_splitter must be specified.'
        if not isinstance(filename, (tuple, list)):
            filename = (filename,)
        self._filenames = [os.path.expanduser(f) for f in filename]
        self._encoding = encoding
        self._flatten = flatten
        self._skip_empty = skip_empty
        self._sample_splitter = sample_splitter
        self._tokenizer = tokenizer
        self._bos = bos
        self._eos = eos
        super(CorpusDataset, self).__init__(self._read())

    def _read(self):
        all_samples = []
        for filename in self._filenames:
            with io.open(filename, 'r', encoding=self._encoding) as fin:
                content = fin.read()
            samples = (s.strip() for s in self._sample_splitter(content))
            if self._tokenizer:
                samples = [
                    _corpus_dataset_process(self._tokenizer(s), self._bos, self._eos)
                    for s in samples if s or not self._skip_empty
                ]
                if self._flatten:
                    samples = concat_sequence(samples)
            elif self._skip_empty:
                samples = [s for s in samples if s]
            all_samples += samples
        return all_samples


if __name__ == '__main__':
    sd = SimpleDataset([1, 2, 3])
    print(sd[0])
    print(list(sd))

    d = []
    sd = SimpleDataset([[1, 2, 3], [1, 2, 3]])
    print('sd[0]: ', sd[0])
    d.extend(sd)  # extend() iterates the dataset and copies the samples out
    print('d: ', d)

    cd = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter)
    print(cd._data)  # peek at the private sample list directly
    cd1 = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter,
                        flatten=True, eos='<end>')
    print(cd1._data)
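The demo reads a testcorpus.txt that is not shipped with the script. Below is a minimal sketch to recreate it, assuming one sentence per line; the three sentences are reconstructed by joining the character tokens in the printed output further down, and the `sentences` helper variable is hypothetical, not part of the original script.

import io

# Hypothetical setup code: writes the sample corpus that the __main__ block
# above expects. Each sentence becomes one line, matching the default
# line_splitter sample_splitter.
sentences = [
    '新浪娱乐讯 北京时间9月4日消息,据《名利场》报道称,罗伯特·帕丁森确诊新冠阳性,他主演的新《蝙蝠侠》电影拍摄也暂停。',
    '不久前,《每日邮报》曝出该片有一名剧组人员感染新冠,刚在英国复工几天的影片拍摄也因此暂停(但报道用的是crew,而非cast,即是指幕后工作人员而非演员)。两小时后,华纳确认有一名《蝙蝠侠》制作团队成员感染了新冠,并简短确认了拍摄暂停一事,按惯例这份声明没有透露感染者身份,只表示其按规定在隔离中。',
    '而又是两小时后,《名利场》称另有高层消息源称是帕丁森新冠检测阳性。他的代理人尚未就此报道做出回复。',
]
with io.open('testcorpus.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(sentences))

With the file in place, running the script prints: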
1
[1, 2, 3]
sd[0]: [1, 2, 3]
d: [[1, 2, 3], [1, 2, 3]]
[['新', '浪', '娱', '乐', '讯', ' ', '北', '京', '时', '间', '9', '月', '4', '日', '消', '息', ',', '据', '《', '名', '利', '场', '》', '报', '道', '称', ',', '罗', '伯', '特', '·', '帕', '丁', '森', '确', '诊', '新', '冠', '阳', '性', ',', '他', '主', '演', '的', '新', '《', '蝙', '蝠', '侠', '》', '电', '影', '拍', '摄', '也', '暂', '停', '。'], ['不', '久', '前', ',', '《', '每', '日', '邮', '报', '》', '曝', '出', '该', '片', '有', '一', '名', '剧', '组', '人', '员', '感', '染', '新', '冠', ',', '刚', '在', '英', '国', '复', '工', '几', '天', '的', '影', '片', '拍', '摄', '也', '因', '此', '暂', '停', '(', '但', '报', '道', '用', '的', '是', 'c', 'r', 'e', 'w', ',', '而', '非', 'c', 'a', 's', 't', ',', '即', '是', '指', '幕', '后', '工', '作', '人', '员', '而', '非', '演', '员', ')', '。', '两', '小', '时', '后', ',', '华', '纳', '确', '认', '有', '一', '名', '《', '蝙', '蝠', '侠', '》', '制', '作', '团', '队', '成', '员', '感', '染', '了', '新', '冠', ',', '并', '简', '短', '确', '认', '了', '拍', '摄', '暂', '停', '一', '事', ',', '按', '惯', '例', '这', '份', '声', '明', '没', '有', '透', '露', '感', '染', '者', '身', '份', ',', '只', '表', '示', '其', '按', '规', '定', '在', '隔', '离', '中', '。'], ['而', '又', '是', '两', '小', '时', '后', ',', '《', '名', '利', '场', '》', '称', '另', '有', '高', '层', '消', '息', '源', '称', '是', '帕', '丁', '森', '新', '冠', '检', '测', '阳', '性', '。', '他', '的', '代', '理', '人', '尚', '未', '就', '此', '报', '道', '做', '出', '回', '复', '。']]
['新', '浪', '娱', '乐', '讯', ' ', '北', '京', '时', '间', '9', '月', '4', '日', '消', '息', ',', '据', '《', '名', '利', '场', '》', '报', '道', '称', ',', '罗', '伯', '特', '·', '帕', '丁', '森', '确', '诊', '新', '冠', '阳', '性', ',', '他', '主', '演', '的', '新', '《', '蝙', '蝠', '侠', '》', '电', '影', '拍', '摄', '也', '暂', '停', '。', '<end>', '不', '久', '前', ',', '《', '每', '日', '邮', '报', '》', '曝', '出', '该', '片', '有', '一', '名', '剧', '组', '人', '员', '感', '染', '新', '冠', ',', '刚', '在', '英', '国', '复', '工', '几', '天', '的', '影', '片', '拍', '摄', '也', '因', '此', '暂', '停', '(', '但', '报', '道', '用', '的', '是', 'c', 'r', 'e', 'w', ',', '而', '非', 'c', 'a', 's', 't', ',', '即', '是', '指', '幕', '后', '工', '作', '人', '员', '而', '非', '演', '员', ')', '。', '两', '小', '时', '后', ',', '华', '纳', '确', '认', '有', '一', '名', '《', '蝙', '蝠', '侠', '》', '制', '作', '团', '队', '成', '员', '感', '染', '了', '新', '冠', ',', '并', '简', '短', '确', '认', '了', '拍', '摄', '暂', '停', '一', '事', ',', '按', '惯', '例', '这', '份', '声', '明', '没', '有', '透', '露', '感', '染', '者', '身', '份', ',', '只', '表', '示', '其', '按', '规', '定', '在', '隔', '离', '中', '。', '<end>', '而', '又', '是', '两', '小', '时', '后', ',', '《', '名', '利', '场', '》', '称', '另', '有', '高', '层', '消', '息', '源', '称', '是', '帕', '丁', '森', '新', '冠', '检', '测', '阳', '性', '。', '他', '的', '代', '理', '人', '尚', '未', '就', '此', '报', '道', '做', '出', '回', '复', '。', '<end>']
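For completeness, a hedged sketch (not in the original script; the names cd2 and cd3 are hypothetical) of two more parameter combinations the docstring describes: tokenizer=None yields the raw stripped lines, and bos/eos wrap each non-flattened tokenized sample.

# With tokenizer=None, each sample is the raw line string from sample_splitter.
cd2 = CorpusDataset('testcorpus.txt', tokenizer=None)
print(cd2[0])  # first line of the corpus, as a single string

# With bos/eos and flatten=False, every tokenized sample is wrapped in markers.
cd3 = CorpusDataset('testcorpus.txt', tokenizer=wordtoword_splitter,
                    bos='<start>', eos='<end>')
print(cd3[0][0], cd3[0][-1])  # prints: <start> <end>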