在机器学习中,我们通常将原始数据按比例随机划分为“训练集”和“测试集”。Python 的 scikit-learn 库提供了现成的函数 train_test_split,完整代码如下:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
# 随机抽样
def read_data(data_file, label, start, test_size, random_state):
    """Load a CSV file and split it into random train and test subsets.

    Column numbers are 1-based (the second column is ``2``).

    Args:
        data_file: Path of the CSV file, read with ``pd.read_csv``.
        label: 1-based number of the column holding the class label.
        start: 1-based number of the first feature column; every column
            from there through the last one is used as a feature.
        test_size: Passed through to ``train_test_split`` as ``test_size``.
        random_state: Random seed, passed through to ``train_test_split``
            as ``random_state``.

    Returns:
        A 5-tuple ``(train_x, train_y, test_x, test_y, dataset)``: the
        feature and label parts of the training split, the feature and
        label parts of the test split, and the full DataFrame as read
        from ``data_file``.
    """
    frame = pd.read_csv(data_file)

    # Translate the 1-based column numbers into 0-based positional slices.
    feature_columns = slice(start - 1, None)
    label_columns = slice(label - 1, label)

    # Features are handed over as a plain array; the label is kept as a
    # one-column slice of the frame rather than being flattened.
    features = frame.iloc[:, feature_columns].values
    labels = frame.iloc[:, label_columns]

    split = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_state,
    )
    x_train, x_test, y_train, y_test = split

    # Return order groups the two training pieces first, then the test pieces.
    return x_train, y_train, x_test, y_test, frame
if __name__ == '__main__':
data_file = "/data/for_python.csv"
out_dir="/data/models_pipli