承接数据分析之数据探索
https://blog.csdn.net/qq_45626019/article/details/108074152
import pandas as pd
import numpy as np
# Load the raw housing dataset from the local CSV file.
housing = pd.read_csv(r"D:\sublime\机器学习\dataset\housing.csv")

# Stratified sampling by income category.
# Bucket median_income by dividing by 1.5 and rounding up.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and replace everything else with 5.0.
# The result is assigned back to the column: calling
# where(..., inplace=True) on a column selected from the frame is chained
# in-place assignment, which does not update the DataFrame when pandas
# Copy-on-Write is enabled.
housing["income_cat"] = housing["income_cat"].where(housing["income_cat"] < 5, 5.0)

from sklearn.model_selection import StratifiedShuffleSplit

# One stratified 80/20 split with a fixed seed for reproducibility.
ss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
for train_index, test_index in ss.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]

# Back to a clean dataset: separate the features from the target (label).
# drop() returns a new frame, so strat_train_set itself is left untouched.
housing = strat_train_set.drop("median_house_value", axis=1)
housing_label = strat_train_set["median_house_value"].copy()
在第一部分我们已经注意到数据中存在部分缺失值。为了适应机器学习算法,我们需要先对缺失数据进行处理。
①放弃含有缺失值的样本(删除对应的行)
housing.dropna(subset