def split_by_group(df, tag_col='标签', flag_col='是否', flag_values=('Y', 'N'),
                   test_frac=0.2, random_state=None):
    """Split *df* into test/train sets, sampling the same fraction per group.

    A group is every row sharing one value of ``tag_col`` and one of the
    listed ``flag_values`` in ``flag_col``. From each group,
    ``int(test_frac * len(group))`` rows (rounded down) are drawn at random
    for the test set; the remaining rows of the group go to the train set.

    Args:
        df: Source frame. Its index labels are assumed to be unique (true
            for a frame freshly returned by ``pd.read_csv``).
        tag_col: Column whose distinct values define the primary groups.
        flag_col: Column holding the secondary label.
        flag_values: Values of ``flag_col`` to consider. Rows whose flag is
            not listed here end up in neither output.
        test_frac: Fraction of each group moved to the test set.
        random_state: Forwarded to ``DataFrame.sample`` for every group;
            ``None`` (the default) gives a different split on each run.

    Returns:
        ``(test, train)`` — two frames with fresh 0..n-1 indexes. When *df*
        yields no groups at all, both are empty frames that keep *df*'s
        columns.
    """
    test_parts = []
    train_parts = []
    for tag in df[tag_col].unique().tolist():
        for flag in flag_values:
            group = df[(df[tag_col] == tag) & (df[flag_col] == flag)]
            # Draw the test share for this group (floor of the fraction).
            picked = group.sample(int(test_frac * len(group)),
                                  random_state=random_state)
            test_parts.append(picked)
            # Label-based removal: the labels come from df's index, not
            # positions within the group, so .drop (like .loc) is required.
            train_parts.append(group.drop(picked.index))

    if not test_parts:
        # pd.concat raises on an empty list; hand back empty frames instead.
        empty = df.iloc[0:0]
        return empty.copy(), empty.copy()

    # Concatenate once at the end instead of re-copying the accumulated
    # frame on every loop iteration.
    test = pd.concat(test_parts, ignore_index=True)
    train = pd.concat(train_parts, ignore_index=True)
    return test, train


if __name__ == '__main__':
    df = pd.read_csv('data4000.csv')  # load the full dataset
    tags = df['标签'].unique().tolist()  # group labels, kept for inspection
    test, train = split_by_group(df)
    # Save both parts as tab-separated text.
    test.to_csv('test.tsv', sep='\t', index=False)
    train.to_csv('train.tsv', sep='\t', index=False)
>>> len(df), len(test),len(train)
(4946, 972, 3974)
pandas 按照某类别等比例划分 test 集
最新推荐文章于 2023-09-07 13:46:30 发布