# 承接：数据分析之测试集划分 https://blog.csdn.net/qq_45626019/article/details/108060392
# 导入数据
import numpy as np
import pandas as pd
# Load the housing dataset from the local CSV file.
housing = pd.read_csv(r"D:\sublime\机器学习\dataset\housing.csv")

# Stratified sampling by income category.
# Derive the category by dividing median income by 1.5 and rounding up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and replace everything else with 5.0.
# The result is assigned back instead of using .where(..., inplace=True) on
# the selected column: the in-place form mutates a temporary Series and is
# not guaranteed to update the DataFrame (it does not under Copy-on-Write).
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0
)

from sklearn.model_selection import StratifiedShuffleSplit

# One split, 20% of rows in the test set, stratified on income_cat;
# random_state is fixed so the split is reproducible.
ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in ss.split(housing, housing["income_cat"]):
    # The loop body must be indented; with n_splits=1 it runs exactly once.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
print(housing['income_cat'].value_counts()/len