import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the housing data set from the local CSV file.
housing = pd.read_csv('D:\\pythondata\\housing.csv')

# Keyword arguments shared by every longitude/latitude scatter plot below.
_geo_scatter = {"kind": "scatter", "x": "longitude", "y": "latitude"}

# Plot 1: plain scatter of the coordinates.
housing.plot(**_geo_scatter)

# Plot 2: same scatter, drawn with alpha=0.1 (mostly transparent markers).
housing.plot(alpha=0.1, **_geo_scatter)

# Plot 3: marker size taken from the "population" column (divided by 100),
# marker colour taken from the "median_house_value" column through the
# "jet" colormap, with a colorbar attached.
_marker_sizes = housing["population"] / 100
_jet_cmap = plt.get_cmap("jet")
housing.plot(
    alpha=0.4,
    s=_marker_sizes,
    label="population",
    c="median_house_value",
    cmap=_jet_cmap,
    colorbar=True,
    **_geo_scatter,
)
# Add a legend to the current axes (uses the "population" label set above).
plt.legend()
# Derive an income category column: median income divided by 1.5, rounded up
# to the next whole number.
housing["income_cat"] = np.ceil(housing["median_income"] / 1.5)
# Keep categories below 5 and replace everything else with 5.0.
# The result is assigned back to the column on purpose: calling
# ``.where(..., inplace=True)`` on ``housing["income_cat"]`` operates on a
# temporary Series, which triggers a chained-assignment warning on recent
# pandas and leaves ``housing`` unchanged under Copy-on-Write.
housing["income_cat"] = housing["income_cat"].where(
    housing["income_cat"] < 5, 5.0
)
from sklearn.model_selection import StratifiedShuffleSplit
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=
《Hands-on Machine Learning with Scikit-Learn and TensorFlow》 读书笔记
最新推荐文章于 2024-06-02 21:59:58 发布
这篇博客是《Hands-on Machine Learning with Scikit-Learn and TensorFlow》的读书笔记,重点介绍了数据预处理的步骤。内容包括读取CSV数据,绘制散点图,进行数据分割,处理缺失值(如使用中位数填充),对分类变量进行编码,以及使用Pipeline和FeatureUnion结合连续和分类特征进行标准化。预处理过程中涉及到的数据集包括地理位置、人口、房价等属性。
摘要由CSDN通过智能技术生成