Downloading a Dataset
from datasets import load_dataset
ds = load_dataset(path="glue", name="sst2")
print(ds)
Output
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})
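If only one split is needed, load_dataset can also return it directly via the split argument instead of a full DatasetDict. A minimal sketch, assuming the same glue/sst2 dataset as above:

from datasets import load_dataset
# load only the training split; this returns a Dataset instead of a DatasetDict
train_ds = load_dataset(path="glue", name="sst2", split="train")
print(train_ds[0])  # a single example as a dict with 'sentence', 'label', 'idx'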
Sorting Data
from datasets import load_dataset
ds = load_dataset(path="glue", name="sst2")
t_ds = ds["train"]
# before sorting
print(t_ds["label"][:10])
# [0, 0, 1, 0, 0, 0, 1, 1, 0, 1]
# after sorting: all negative (0) labels first, all positive (1) labels last
sorted_ds = t_ds.sort("label")
print(sorted_ds["label"][:10])
# [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
print(sorted_ds["label"][-10:])
# [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
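For descending order, recent versions of datasets also accept a reverse flag on sort. A minimal sketch, assuming the reverse parameter is available in your installed version:

# descending sort: positive (1) labels come first (reverse is an assumption; check your datasets version)
desc_ds = t_ds.sort("label", reverse=True)
print(desc_ds["label"][:10])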
Shuffling Data
from datasets import load_dataset
ds = load_dataset(path="glue", name="sst2")
t_ds = ds["train"]
# before shuffling
print(t_ds["label"][:10])
# [0, 0, 1, 0, 0, 0, 1, 1, 0, 1]
# after shuffling (fixed seed for reproducibility)
shuffled = t_ds.shuffle(seed=1)
print(shuffled["label"][:10])
# [1, 1, 1, 1, 0, 1, 1, 1, 0, 0]
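Note that shuffle does not copy the data; it only creates an indices mapping over the underlying Arrow table, which can make later row access slower. If that matters, flatten_indices can rewrite the shuffled dataset contiguously. A minimal sketch:

# materialize the shuffled order so later reads are contiguous (optional step)
shuffled = shuffled.flatten_indices()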
Splitting Data
from datasets import load_dataset
ds = load_dataset(path="glue", name="sst2")
t_ds = ds["train"]
# the full training set
print(t_ds)
# split it into a train set and a test set (10% held out for test)
split_ds = t_ds.train_test_split(test_size=0.1)
print(split_ds)
Output
DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 60614
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 6735
    })
})
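train_test_split can also stratify on a ClassLabel column so both splits keep the same label distribution. The label column of glue/sst2 is a ClassLabel, so the sketch below should apply, but treat stratify_by_column as an assumption about your datasets version:

# stratified 90/10 split that preserves the 0/1 label ratio (sketch)
stratified = t_ds.train_test_split(test_size=0.1, seed=1, stratify_by_column="label")
print(stratified)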
Processing Data with the map Function
from datasets import load_dataset
dataset = load_dataset("glue", "mrpc", split="train")
# pick a handful of examples to work with
small_dataset = dataset.select([0, 10, 20, 30, 40, 50])

def add_prefix(example):
    example["sentence1"] = "My sentence: " + example["sentence1"]
    return example

updated_dataset = small_dataset.map(add_prefix)
print(updated_dataset["sentence1"][0])
# My sentence: Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
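map can also process examples in batches, which is usually faster for heavier transforms such as tokenization; with batched=True each field arrives as a list. A minimal sketch reusing small_dataset from above (add_prefix_batched is a hypothetical helper name):

def add_prefix_batched(batch):
    # with batched=True, batch["sentence1"] is a list of strings
    batch["sentence1"] = ["My sentence: " + s for s in batch["sentence1"]]
    return batch

updated_batched = small_dataset.map(add_prefix_batched, batched=True)
print(updated_batched["sentence1"][0])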
GitHub
https://github.com/nowang6/huggingface