Splitting Excel data into a training set and a test set
The script is fast and stays fast even on large amounts of data. If you find it useful, please give it a like!
Code
import random
import xlrd
import numpy as np
from xlutils.copy import copy
data_path = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\train10000_2.xls'
train_file = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\train_split14000.xls'
test_file = 'C:\\Users\\gj7520\\Desktop\\pythob_files\\file_select\\data2\\test_split14000.xls'
def write_excel_xls_append(path, value):
    index = len(value)  # number of rows to append
    workbook = xlrd.open_workbook(path)  # open the existing workbook
    sheets = workbook.sheet_names()  # names of all sheets in the workbook
    worksheet = workbook.sheet_by_name(sheets[0])  # first sheet of the workbook
    rows_old = worksheet.nrows  # rows already present in that sheet
    new_workbook = copy(workbook)  # convert the xlrd object into an xlwt object
    new_worksheet = new_workbook.get_sheet(0)  # first sheet of the converted workbook
    for i in range(0, index):
        for j in range(0, len(value[i])):
            new_worksheet.write(i + rows_old, j, value[i][j])  # append after the existing rows, starting at row i + rows_old
    new_workbook.save(path)  # save the workbook
data = xlrd.open_workbook(data_path)
sheet = data.sheet_by_index(0)

# Collect every row index, draw 10% of them at random as the test set,
# and keep the remaining indices as the training set.
row_indices = list(range(sheet.nrows))
test_rows = random.sample(row_indices, int(sheet.nrows * 0.1))
for item in test_rows:
    row_indices.remove(item)
train_rows = row_indices

mat_train = np.zeros([len(train_rows), sheet.ncols])
mat_test = np.zeros([len(test_rows), sheet.ncols])
print(mat_train.shape)
print(mat_test.shape)

# Copy each row of the source sheet into the test or training matrix.
num_test = 0
num_train = 0
for row in range(0, sheet.nrows):
    if row in test_rows:
        mat_test[num_test] = sheet.row_values(row)
        num_test += 1
    else:
        mat_train[num_train] = sheet.row_values(row)
        num_train += 1

# Append the matrices to the (already existing) output workbooks.
write_excel_xls_append(train_file, mat_train)
write_excel_xls_append(test_file, mat_test)
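One thing to note: write_excel_xls_append appends to a workbook that already exists, so train_file and test_file must be valid .xls files with at least one sheet before the script runs. A minimal sketch for creating empty output workbooks with xlwt (the helper name create_empty_xls and the sheet name 'Sheet1' are illustrative choices, not part of the original script):

import xlwt

def create_empty_xls(path, sheet_name='Sheet1'):
    # Create a new .xls workbook with a single empty sheet so that
    # write_excel_xls_append can later open it and append rows.
    workbook = xlwt.Workbook()
    workbook.add_sheet(sheet_name)
    workbook.save(path)

create_empty_xls(train_file)
create_empty_xls(test_file)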
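Removing each sampled index from a Python list one by one is quadratic in the number of rows; if the speed claim matters for very large sheets, drawing the test indices once and taking the complement with a set lookup keeps the split linear. A sketch of that variant (the function and parameter names are illustrative):

def split_row_indices(n_rows, test_ratio=0.1):
    # Sample the test indices once, then take the complement in a single pass.
    test_rows = set(random.sample(range(n_rows), int(n_rows * test_ratio)))
    train_rows = [r for r in range(n_rows) if r not in test_rows]
    return train_rows, test_rows

train_rows, test_rows = split_row_indices(sheet.nrows)

Keeping test_rows as a set also speeds up the `row in test_rows` check in the copy loop above.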