把一个csv文件划分成不同csv文件的存档,可以把train划分为train和val
参考链接:python将大csv文件划分成小csv文件做训练集和测试集
对参考代码做了一些修改,更适用于中文文本分类
import csv
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
def read_data1(path=None, encoding="gbk"):
    """Load review texts and their scores from a CSV file.

    Args:
        path: Location of the CSV file to read. When omitted, the original
            hard-coded JD review file is used, so existing ``read_data1()``
            callers keep working unchanged.
        encoding: Text encoding of the CSV file (the source file is GBK).

    Returns:
        A ``(texts, scores)`` tuple of NumPy arrays built from the
        ``'评论'`` and ``'评分'`` columns respectively.

    Raises:
        KeyError: If the file lacks a ``'评论'`` or ``'评分'`` column.
    """
    if path is None:
        path = 'D:/Python_practice_for_DL/bert base Chinese 练习/JDcontents_vivo.csv'
    data = pd.read_csv(path, encoding=encoding)
    return np.array(data['评论']), np.array(data['评分'])
def _write_labeled_csv(path, texts, labels):
    """Write paired texts and labels to ``path`` as a two-column CSV with a header."""
    # newline="" keeps the csv module from emitting an extra blank line after
    # each row; utf_8_sig prefixes the file with a UTF-8 BOM; errors="ignore"
    # drops any character that cannot be encoded instead of raising.
    with open(path, 'w', encoding="utf_8_sig", newline="", errors="ignore") as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=['评论', '打分'])
        writer.writeheader()
        for text, label in zip(texts, labels):
            writer.writerow({'评论': text, '打分': label})


def split_data(data_list, y_list, ratio=0.20, train_path=None, test_path=None):
    """Split samples into train/test subsets and save each subset as a CSV file.

    Args:
        data_list: Sample texts to split.
        y_list: Labels aligned one-to-one with ``data_list``.
        ratio: Fraction of the samples placed in the test subset; the default
            0.20 gives an 80% train / 20% test split.
        train_path: Output path of the training CSV. Defaults to the
            original hard-coded ``sub_train.csv`` location.
        test_path: Output path of the test CSV. Defaults to the original
            hard-coded ``sub_test.csv`` location.

    Returns:
        The ``(X_train, X_test, y_train, y_test)`` tuple produced by
        ``train_test_split``.
    """
    if train_path is None:
        train_path = 'D:/Python_practice_for_DL/bert base Chinese 练习/data/sub_train.csv'
    if test_path is None:
        test_path = 'D:/Python_practice_for_DL/bert base Chinese 练习/data/sub_test.csv'

    # A fixed random_state makes the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(
        data_list, y_list, test_size=ratio, random_state=50)

    # Training subset.
    _write_labeled_csv(train_path, X_train, y_train)
    # Test subset: must be written from the held-out X_test / y_test rows,
    # never from the training rows, otherwise evaluation data leaks.
    _write_labeled_csv(test_path, X_test, y_test)

    return X_train, X_test, y_train, y_test
if __name__ == '__main__':
    # Load the review texts and their scores from the large source CSV.
    texts, labels = read_data1()
    # Split them into training and test subsets; split_data also writes
    # each subset out to its own CSV file.
    split_data(texts, labels)