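# Notes on splitting CSV files:
# 1. Pay attention to the delimiter used when reading a file.
# 2. Pay attention to the delimiter used when saving a file.
# Delimiters are a common pitfall: some files use ",", some use spaces, and
# some use tabs -- there is no single convention.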
import os
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
def rename(target="tianchi_datasets"):
    """Prefix every file under *target* with the name of its directory.

    The tree is walked bottom-up and each file ``<dir>/<name>`` is renamed
    to ``<dir>/<dirname>_<name>``, where ``<dirname>`` is the last component
    of the directory that contains the file. Each rename is printed as
    ``old_path <tab> new_path``.

    Args:
        target: Root directory to walk. Defaults to ``"tianchi_datasets"``.

    Note:
        The function is not idempotent: running it twice prefixes the
        files twice.
    """
    for root, _dirs, files in os.walk(target, topdown=False):
        # abspath normalises the path first, so a trailing separator (or a
        # platform-specific separator) never produces an empty prefix.
        prefix = os.path.basename(os.path.abspath(root))
        for name in files:
            old_path = os.path.join(root, name)
            new_path = os.path.join(root, prefix + '_' + name)
            print(old_path, "\t", new_path)
            os.rename(old_path, new_path)
def break_data(target, rate=0.2, sep="\t", random_state=None):
    """Split each underscore-named file under *target* into train/dev files.

    The tree is walked bottom-up. Every file whose name contains ``"_"`` is
    read as a header-less delimited table, split with ``train_test_split``,
    and the two parts are written next to the source file as ``train.csv``
    and ``dev.csv`` (no header row, no index column).

    Args:
        target: Root directory to walk.
        rate: Forwarded as ``test_size`` to ``train_test_split``; the
            held-out part is written to ``dev.csv``.
        sep: Field delimiter used both for reading the source file and for
            writing the two output files. Defaults to a tab.
        random_state: Forwarded to ``train_test_split``. ``None`` (the
            default) leaves the split unseeded, as before.

    Note:
        The output names are fixed, so if one directory holds several
        matching files, each split overwrites the previous one and only the
        last processed file's split remains.
    """
    for root, _dirs, files in os.walk(target, topdown=False):
        for name in files:
            if "_" not in name:
                continue
            print(name)
            source_path = os.path.join(root, name)
            dataset = pd.read_csv(source_path, sep=sep, header=None)
            train_part, dev_part = train_test_split(
                dataset, test_size=rate, random_state=random_state
            )
            # header=False is the documented way to suppress the header row.
            train_part.to_csv(
                os.path.join(root, 'train.csv'), index=False, sep=sep, header=False
            )
            dev_part.to_csv(
                os.path.join(root, 'dev.csv'), index=False, sep=sep, header=False
            )
if __name__ == "__main__":
    # The rename() step is left disabled here; only the train/dev split runs.
    break_data("datasets", 0.2)