import pandas as pd
from sklearn.model_selection import train_test_split
def split_data(data_df: pd.DataFrame, filepath1: str, filepath2: str, *,
               test_size=0.15, random_state=2022, stratify: bool = False) -> None:
    """Randomly split a labelled text dataset and write both parts to CSV.

    Only the ``text`` and ``label`` columns of *data_df* are kept. The rows
    are partitioned with ``train_test_split``; the training part is written
    to *filepath1* and the held-out part to *filepath2*, both without the
    index column.

    Args:
        data_df: DataFrame containing ``text`` and ``label`` columns.
        filepath1: Output path for the training part. ``str.format`` is
            applied to it with the number of training rows, so a ``{}``
            placeholder in the path is replaced by that count.
        filepath2: Output path for the held-out part; formatted the same
            way with the number of held-out rows.
        test_size: Forwarded to ``train_test_split``. Defaults to 0.15.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 2022.
        stratify: When true, stratify the split on the ``label`` column.
            scikit-learn then requires every label to occur at least twice.

    Raises:
        KeyError: If *data_df* lacks a ``text`` or ``label`` column.
    """
    selected = data_df[['text', 'label']]
    # Splitting the two-column frame directly yields the same row partition
    # as splitting the two Series separately: the shuffle depends only on
    # the number of rows and the seed.
    train_part, test_part = train_test_split(
        selected,
        test_size=test_size,
        random_state=random_state,
        stratify=selected['label'] if stratify else None,
    )

    # Resolve each output path once; a '{}' placeholder receives the row count.
    train_path = filepath1.format(len(train_part))
    test_path = filepath2.format(len(test_part))

    train_part.to_csv(train_path, index=False)
    test_part.to_csv(test_path, index=False)
    print('yeah.', train_path, test_path)
    print('done')
# Script entry: load the labelled events and write the two split CSV files.
# Neither output name contains a '{}' placeholder, so both are used as given.
data_df = pd.read_csv('lic_event.csv')
split_data(
    data_df,
    filepath1='train_lic_event.csv',
    filepath2='test_lic_event.csv',
)
在进行模型训练时,有时需要手动把一份数据文件拆分为训练集和测试集两部分,此时可使用上述方法。
此外,如果需要按照 label 进行分层拆分(即让训练集和测试集中各标签的比例保持一致),可将上述方法的第一行修改如下(示例中同时把 test_size 改成了 0.2):
X_train, X_val, y_train, y_val = train_test_split(data_df['text'], data_df['label'], test_size=0.2, random_state=2022, stratify=data_df['label'])
但需注意:此方法要求每个标签对应的样本数至少为 2,否则 train_test_split 会抛出 ValueError。