定义一个方法,可以直接将全部的数据集划分为训练集与测试集,并分别写入 train.txt 与 test.txt:
def nlp_split(path, size=0.3, sep='__label__'):
    """Split a labelled text corpus into ``train.txt`` and ``test.txt``.

    Every non-blank line of the input file is expected to have the form
    ``<text><TAB><sep><label>``. The samples are shuffled and divided with
    ``sklearn.model_selection.train_test_split`` using a fixed
    ``random_state=42`` and written back in the same line format.

    Args:
        path: Path of the UTF-8 encoded dataset file to split.
        size: Forwarded to ``train_test_split`` as ``test_size``.
        sep: Marker that separates the text from its label on each line.

    Raises:
        ValueError: If a non-blank line does not contain ``sep``.

    Notes:
        * Both output files are created in the current working directory and
          opened in append mode, so calling the function again adds to
          existing files rather than replacing them.
        * If ``sep`` occurs more than once on a line, only the field between
          its first and second occurrence is used as the label.
    """
    # Imported locally, exactly as the original block did.
    from sklearn.model_selection import train_test_split

    text_list = []
    label_list = []
    with open(path, 'r', encoding='utf8') as file:
        for line_no, line in enumerate(file, start=1):
            # Blank lines (typically a trailing empty line) carry no sample.
            if not line.strip():
                continue
            # Drop the line terminator here so that every record is written
            # back with exactly one newline, even when the last input line
            # had none (otherwise two records would be glued together).
            fields = line.rstrip('\n').split(sep)
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: separator {!r} not found'.format(path, line_no, sep)
                )
            text_list.append(fields[0].strip('\t'))
            label_list.append(fields[1])

    X_train, X_test, y_train, y_test = train_test_split(
        text_list, label_list, test_size=size, random_state=42
    )

    outputs = (
        ('train.txt', X_train, y_train),
        ('test.txt', X_test, y_test),
    )
    for out_path, texts, labels in outputs:
        with open(out_path, 'a', encoding='utf8') as out:
            out.writelines(
                text + '\t' + sep + label + '\n'
                for text, label in zip(texts, labels)
            )
其中
- path为数据集路径
- size为测试集所占的比例(默认为0.3,即30%的数据作为测试集)
- sep为每行中文本与标签之间的分隔标记(默认为 `__label__`)