1.np.random.permutation法
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly split ``data`` into a training set and a test set.

    The row positions are shuffled with ``np.random.permutation``; the first
    ``int(len(data) * test_ratio)`` shuffled positions form the test set and
    the remaining positions form the training set.

    Args:
        data: Object supporting ``len()`` and positional ``.iloc[...]``
            indexing (for example a pandas DataFrame).
        test_ratio: Fraction of the rows to put in the test set; must lie
            in the closed interval [0, 1].

    Returns:
        A ``(train_set, test_set)`` tuple of the selected rows.

    Raises:
        ValueError: If ``test_ratio`` is not within [0, 1].
    """
    # A negative or >1 ratio would silently produce a meaningless slice split.
    if not 0 <= test_ratio <= 1:
        raise ValueError(
            f"test_ratio must be between 0 and 1, got {test_ratio!r}"
        )

    shuffled_indices = np.random.permutation(len(data))
    # Truncate toward zero so the test set never exceeds the requested share.
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    return data.iloc[train_indices], data.iloc[test_indices]
# Hold out 20% of the housing rows as the test set, then report both sizes.
train_set, test_set = split_train_test(housing, test_ratio=0.2)
for label, subset in (('len_train_set:', train_set), ('len_test_set:', test_set)):
    print(label, len(subset))
#输出
len_train_set: 16512
len_test_set: 4128
缺点:
a.每次运行程序都会得到不同的分割(训练集和测试集不固定)
b.使用np.random.seed(42)固定随机种子可以让每次运行得到相同的分割,但是数据集更新(增加或删除样本)后,分割结果仍会变化
2.crc32法
from zlib import crc32
def test_set_check(identifier, test_ratio):
return crc32(np.int64(identifier)) & 0xffffffff < test_ratio * 2**32
def split_train_test_by_id(data, test_ratio,