通常,我们在建模之前会对原始数据进行切分,现将该切分过程封装为函数。
def data_split(data_df, object_col, test_size=0.3, random_state=0):
    """Split a DataFrame into training and test features and targets.

    With the default arguments, 30% of the rows go to the test set (a 7:3
    train/test split) and the shuffle is seeded with 0, so repeated calls
    on the same data yield the same partition.

    Args:
        data_df: Raw data as a pandas DataFrame holding the target column
            together with the feature columns.
        object_col: Label of the target-variable column in ``data_df``.
        test_size: Forwarded to ``train_test_split``; the share of rows
            placed in the test set. Defaults to 0.3.
        random_state: Forwarded to ``train_test_split``; seed controlling
            the shuffle. Defaults to 0.

    Returns:
        A tuple ``(X_train, X_test, Y_train, Y_test)``: the feature
        columns and the target column for the training and test sets.

    Raises:
        KeyError: If ``object_col`` is not a column of ``data_df``.
    """
    # Local import: scikit-learn is only required when a split is requested.
    from sklearn.model_selection import train_test_split

    # Separate the target column from the feature columns.
    Y = data_df[object_col]
    X = data_df.drop(object_col, axis=1)

    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, Y_train, Y_test
if __name__ == '__main__':
    import numpy as np
    import pandas as pd

    # Load the raw training data, then split it into training and test sets
    # using 'SeriousDlqin2yrs' as the target column.
    data = pd.read_csv('data/cs-training.csv')
    X_train, X_test, Y_train, Y_test = data_split(
        data_df=data,
        object_col='SeriousDlqin2yrs',
    )