Gitee: https://gitee.com/Shine_Zhang/machine_learning_practices
参考:
- https://zhuanlan.zhihu.com/p/88797654
- 《机器学习实战:基于Scikit-Learn、Keras和TensorFlow》(第2版)
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from zlib import crc32
# Default directory that is expected to contain ``housing.csv``.
HOUSING_PATH = 'datasets/housing'


def load_housing_data(housing_path: str = HOUSING_PATH) -> pd.DataFrame:
    """Read ``housing.csv`` from *housing_path* into a DataFrame.

    Args:
        housing_path: Directory containing the ``housing.csv`` file.
            Defaults to ``HOUSING_PATH``.

    Returns:
        The parsed CSV contents as returned by ``pandas.read_csv``.

    Raises:
        FileNotFoundError: Propagated from ``pandas.read_csv`` when the
            file does not exist.
    """
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)
# Load the raw housing table from the default dataset directory.
housing = load_housing_data(HOUSING_PATH)

# Stratification: bucket the median income into five labelled categories.
income_bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
income_bin_labels = [1, 2, 3, 4, 5]
housing["income_cat"] = pd.cut(
    housing["median_income"],
    bins=income_bin_edges,
    labels=income_bin_labels,
)

# Histogram of the resulting income categories.
income_categories = housing["income_cat"]
income_categories.hist()
plt.show()
# Stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit

# n_splits is the number of train/test index pairs to generate (the
# scikit-learn default is 10); a single pair is all that is needed here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# With n_splits=1 the generator yields exactly one (train, test) pair, so it
# is taken directly instead of looping.  The yielded arrays are positional
# indices, hence .iloc rather than label-based .loc.
train_index, test_index = next(split.split(housing, housing["income_cat"]))
strat_train_set = housing.iloc[train_index]
strat_test_set = housing.iloc[test_index]
strat_train_set.info()
# Remove the helper "income_cat" column so both sets return to the original
# columns.  Rebinding to the result of drop() (instead of dropping in place)
# avoids mutating frames that were derived by indexing another frame.
strat_train_set = strat_train_set.drop(columns="income_cat")
strat_test_set = strat_test_set.drop(columns="income_cat")
strat_train_set.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 11 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 16512 non-null float64
1 latitude 16512 non-null float64
2 housing_median_age 16512 non-null float64
3 total_rooms 16512 non-null float64
4 total_bedrooms 16354 non-null float64
5 population 16512 non-null float64
6 households 16512 non-null float64
7 median_income 16512 non-null float64
8 median_house_value 16512 non-null float64
9 ocean_proximity 16512 non-null object
10 income_cat 16512 non-null category
dtypes: category(1), float64(9), object(1)
memory usage: 1.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 16512 entries, 17606 to 15775
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 16512 non-null float64
1 latitude 16512 non-null float64
2 housing_median_age 16512 non-null float64
3 total_rooms 16512 non-null float64
4 total_bedrooms 16354 non-null float64
5 population 16512 non-null float64
6 households 16512 non-null float64
7 median_income 16512 non-null float64
8 median_house_value 16512 non-null float64
9 ocean_proximity 16512 non-null object
dtypes: float64(9), object(1)
memory usage: 1.4+ MB
# Explore a copy so the stratified training set itself stays untouched.
housing = strat_train_set.copy()

# Geographic scatter plots: once fully opaque, once translucent
# (alpha is the marker opacity, from 0 to 1).
for opacity in (1, 0.1):
    housing.plot(kind="scatter", x="longitude", y="latitude", alpha=opacity)
plt.show()

# House-price map: marker size follows population / 100 and marker colour
# follows median_house_value on the "jet" colormap.
marker_sizes = housing["population"] / 100
housing.plot(
    kind="scatter",
    x="longitude",
    y="latitude",
    alpha=0.4,
    s=marker_sizes,
    label="population",
    figsize=(20, 14),
    c="median_house_value",
    cmap=plt.get_cmap("jet"),
    colorbar=True,
)
plt.legend()  # add the legend to the figure
plt.show()
# Standard correlation coefficient (Pearson's r) between every pair of
# numeric attributes.  Non-numeric columns (here the text column
# "ocean_proximity") are excluded explicitly: pandas >= 2.0 no longer drops
# them silently in DataFrame.corr() and raises on them instead.
corr_matrix = housing.select_dtypes(include="number").corr()
print(corr_matrix)
longitude latitude housing_median_age total_rooms \
longitude 1.000000 -0.924478 -0.105848 0.048871
latitude -0.924478 1.000000 0.005766 -0.039184
housing_median_age -0.105848 0.005766 1.000000 -0.364509
total_rooms 0.048871 -0.039184 -0.364509 1.000000
total_bedrooms 0.076598 -0.072419 -0.325047 0.929379
population 0.108030 -0.115222 -0.298710 0.855109
households 0.063070 -0.077647 -0.306428 0.918392
median_income -0.019583 -0.075205 -0.111360 0.200087
median_house_value -0.047432 -0.142724 0.114110 0.135097
total_bedrooms population households median_income \
longitude 0.076598 0.108030 0.063070 -0.019583
latitude -0.072419 -0.115222 -0.077647 -0.075205
housing_median_age -0.325047 -0.298710 -0.306428 -0.111360
total_rooms 0.929379 0.855109 0.918392 0.200087
total_bedrooms 1.000000 0.876320 0.980170 -0.009740
population 0.876320 1.000000 0.904637 0.002380
households 0.980170 0.904637 1.000000 0.010781
median_income -0.009740 0.002380 0.010781 1.000000
median_house_value 0.047689 -0.026920 0.064506 0.687160
median_house_value
longitude -0.047432
latitude -0.142724
housing_median_age 0.114110
total_rooms 0.135097
total_bedrooms 0.047689
population -0.026920
households 0.064506
median_income 0.687160
median_house_value 1.000000
# Attribute correlations via pandas' scatter_matrix (see the pandas docs).
from pandas.plotting import scatter_matrix

attributes = [
    "median_house_value",
    "median_income",
    "total_rooms",
    "housing_median_age",
]
selected_columns = housing[attributes]
scatter_matrix(selected_columns, figsize=(21, 14))

# Median income plotted against median house value.
housing.plot(
    kind="scatter",
    x="median_income",
    y="median_house_value",
    alpha=0.1,
)
plt.show()
# Derived ratio attributes
housing["rooms_per_household"] = housing["total_rooms"] / housing["households"]
housing["bedrooms_per_room"] = housing["total_bedrooms"] / housing["total_rooms"]
housing["population_per_household"] = housing["population"] / housing["households"]

# Recompute the correlations with the new attributes included.  Only numeric
# columns are used, because pandas >= 2.0 raises in DataFrame.corr() when a
# non-numeric column such as "ocean_proximity" is present.
corr_matrix = housing.select_dtypes(include="number").corr()

# ascending=False sorts in DESCENDING order, so the attributes most
# positively correlated with median_house_value are printed first.
print(corr_matrix["median_house_value"].sort_values(ascending=False))
median_house_value 1.000000
median_income 0.687160
rooms_per_household 0.146285
total_rooms 0.135097
housing_median_age 0.114110
households 0.064506
total_bedrooms 0.047689
population_per_household -0.021985
population -0.026920
longitude -0.047432
latitude -0.142724
bedrooms_per_room -0.259984
Name: median_house_value, dtype: float64