The dataset originally had too many features and one-hot encoding ran out of memory, so I considered processing it in chunks. After I picked out a subset of key features the problem went away, so the approach below has never actually been run; I'm writing it down here as a memo.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Fix the random seed for reproducibility
SEED = 0
np.random.seed(SEED)
def get_train_test(chunk, test_size=0.95, SEED=SEED):
    """Split one chunk into train and test sets (95% of each chunk goes to the test set)."""
    y = 1 * (chunk['target'] == "REP")  # map {"REP": 1, "DEM": 0}
    X = chunk.drop(["target"], axis=1)
    X = pd.get_dummies(X, sparse=True)  # one-hot encoding; this is the memory-hungry step
    return train_test_split(X, y, test_size=test_size, random_state=SEED)
# Process the data in chunks
df_reader = pd.read_csv(r'...\data\data.csv', iterator=True)
xtrain_chunks, xtest_chunks, ytrain_chunks, ytest_chunks = [], [], [], []
while True:
    try:
        chunk = df_reader.get_chunk(5000)
        # Drop columns that are constant within the chunk
        chunk = chunk.loc[:, (chunk != chunk.iloc[0]).any()]
        xtrain, xtest, ytrain, ytest = get_train_test(chunk)
        xtrain_chunks.append(xtrain)
        xtest_chunks.append(xtest)
        ytrain_chunks.append(ytrain)
        ytest_chunks.append(ytest)
        print('iteration: %d' % len(xtrain_chunks))
    except StopIteration:  # get_chunk raises this once the file is exhausted
        print('Finished')
        break
xtrain = pd.concat(xtrain_chunks, ignore_index=True)
xtest = pd.concat(xtest_chunks, ignore_index=True)
ytrain = pd.concat(ytrain_chunks, ignore_index=True)
ytest = pd.concat(ytest_chunks, ignore_index=True)
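One caveat worth noting: pd.get_dummies is applied per chunk, so a chunk that happens to miss some category produces fewer dummy columns, and the final pd.concat fills those gaps with NaN; the per-chunk constant-column filter has the same alignment problem, since different chunks may drop different columns. A minimal sketch of one way around this, assuming the full category sets are known up front (the column name "state" and its values are hypothetical, just for illustration):

import pandas as pd

# Hypothetical column "state" with a known, fixed set of values. Casting it to
# a fixed CategoricalDtype makes pd.get_dummies emit one column per category in
# every chunk, even when a category is absent from that chunk, so the later
# pd.concat aligns columns cleanly.
STATE_DTYPE = pd.CategoricalDtype(categories=["CA", "NY", "TX"])

def encode_chunk(chunk):
    chunk["state"] = chunk["state"].astype(STATE_DTYPE)
    return pd.get_dummies(chunk, sparse=True)

If collecting the categories in advance is impractical, a cruder fallback is to call fillna(0) on xtrain and xtest right after the concat.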