例如划分训练集、测试集、验证集的比例为 $6:2:2$,代码如下:
"""Split a CSV dataset into test, validation, and training sets."""
from typing import Optional, Tuple

import pandas as pd


def split_dataset(
    data: pd.DataFrame,
    test_frac: float = 0.2,
    validate_frac: float = 0.2,
    random_state: Optional[int] = None,
) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Shuffle *data* and cut it into test, validation, and training parts.

    Args:
        data: The DataFrame to split. It is not modified; ``sample`` returns
            a new, shuffled frame.
        test_frac: Fraction of the rows assigned to the test set.
        validate_frac: Fraction of the rows assigned to the validation set.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            shuffle can be reproduced. ``None`` leaves the shuffle unseeded.

    Returns:
        A ``(test, validate, train)`` tuple. The training part receives every
        row that was not assigned to the other two.

    Raises:
        ValueError: If either fraction is negative or their sum exceeds 1.
    """
    if not (0 <= test_frac and 0 <= validate_frac
            and test_frac + validate_frac <= 1):
        raise ValueError(
            "test_frac and validate_frac must be non-negative and sum to at "
            f"most 1, got {test_frac!r} and {validate_frac!r}"
        )

    # frac=1.0 draws every row exactly once, i.e. a full shuffle.
    shuffled = data.sample(frac=1.0, random_state=random_state)
    rows = len(shuffled)

    # Both cut points are truncated from the cumulative fraction so that the
    # rounding remainder always ends up in the training part.
    test_end = int(rows * test_frac)
    validate_end = int(rows * (test_frac + validate_frac))

    data_test = shuffled.iloc[:test_end, :]
    data_validate = shuffled.iloc[test_end:validate_end, :]
    data_train = shuffled.iloc[validate_end:, :]
    return data_test, data_validate, data_train


if __name__ == "__main__":
    # header=None: the first line of the file is read as data, not as
    # column names.
    corpus = pd.read_csv("corpus-model-filtered.csv", header=None)
    data_test, data_validate, data_train = split_dataset(corpus)

    outputs = {
        "corpus-model-filtered-test.csv": data_test,
        "corpus-model-filtered-validate.csv": data_validate,
        "corpus-model-filtered-train.csv": data_train,
    }
    for path, frame in outputs.items():
        # header=False / index=False: write only the cell values, without a
        # column-name row or the shuffled row index.
        frame.to_csv(path, header=False, index=False)
    print("划分完毕")
希望对读者有帮助。