Gitee:https://gitee.com/Shine_Zhang/machine_learning_practices
参考:
- https://zhuanlan.zhihu.com/p/88797654
- 《机器学习实战:基于Scikit-Learn、Keras和TensorFlow第2版》
加载数据,读取CSV文件
import pandas as pd
import os
# Default directory (relative to the working directory) that holds housing.csv.
HOUSING_PATH = 'datasets/housing'


def load_housing_data(housing_path=HOUSING_PATH):
    """Read ``housing.csv`` from *housing_path* into a DataFrame.

    Args:
        housing_path: Directory containing ``housing.csv``. Defaults to
            ``HOUSING_PATH``.

    Returns:
        The ``pandas.DataFrame`` returned by ``pd.read_csv``.
    """
    csv_path = os.path.join(housing_path, 'housing.csv')
    return pd.read_csv(csv_path)
# Load the dataset from the default location and preview its first rows.
housing = load_housing_data(HOUSING_PATH)
print(housing.head())
longitude latitude housing_median_age total_rooms total_bedrooms \
0 -122.23 37.88 41.0 880.0 129.0
1 -122.22 37.86 21.0 7099.0 1106.0
2 -122.24 37.85 52.0 1467.0 190.0
3 -122.25 37.85 52.0 1274.0 235.0
4 -122.25 37.85 52.0 1627.0 280.0
population households median_income median_house_value ocean_proximity
0 322.0 126.0 8.3252 452600.0 NEAR BAY
1 2401.0 1138.0 8.3014 358500.0 NEAR BAY
2 496.0 177.0 7.2574 352100.0 NEAR BAY
3 558.0 219.0 5.6431 341300.0 NEAR BAY
4 565.0 259.0 3.8462 342200.0 NEAR BAY
# Print the per-column non-null counts and dtypes.
housing.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 longitude 20640 non-null float64
1 latitude 20640 non-null float64
2 housing_median_age 20640 non-null float64
3 total_rooms 20640 non-null float64
4 total_bedrooms 20433 non-null float64
5 population 20640 non-null float64
6 households 20640 non-null float64
7 median_income 20640 non-null float64
8 median_house_value 20640 non-null float64
9 ocean_proximity 20640 non-null object
dtypes: float64(9), object(1)
memory usage: 1.6+ MB
# Print summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
print(housing.describe())
longitude latitude housing_median_age total_rooms \
count 20640.000000 20640.000000 20640.000000 20640.000000
mean -119.569704 35.631861 28.639486 2635.763081
std 2.003532 2.135952 12.585558 2181.615252
min -124.350000 32.540000 1.000000 2.000000
25% -121.800000 33.930000 18.000000 1447.750000
50% -118.490000 34.260000 29.000000 2127.000000
75% -118.010000 37.710000 37.000000 3148.000000
max -114.310000 41.950000 52.000000 39320.000000
total_bedrooms population households median_income \
count 20433.000000 20640.000000 20640.000000 20640.000000
mean 537.870553 1425.476744 499.539680 3.870671
std 421.385070 1132.462122 382.329753 1.899822
min 1.000000 3.000000 1.000000 0.499900
25% 296.000000 787.000000 280.000000 2.563400
50% 435.000000 1166.000000 409.000000 3.534800
75% 647.000000 1725.000000 605.000000 4.743250
max 6445.000000 35682.000000 6082.000000 15.000100
median_house_value
count 20640.000000
mean 206855.816909
std 115395.615874
min 14999.000000
25% 119600.000000
50% 179700.000000
75% 264725.000000
max 500001.000000
# NOTE: the next line is an IPython/Jupyter magic, not plain Python; it only
# works when this cell runs inside a notebook/IPython session.
%matplotlib inline
import matplotlib.pyplot as plt
# Draw histograms of the DataFrame's columns with 50 bins each, figsize (20, 15).
housing.hist(bins=50, figsize=(20, 15))
plt.show()
创建训练集/测试集
# 并不完美,每次产生一个不同的数据集
import numpy as np
def split_train_test(data, test_ratio, random_state=None):
    """Randomly partition *data* into a training set and a test set.

    Args:
        data: DataFrame to split; rows are picked by integer position.
        test_ratio: Fraction of rows to put in the test set. The test set
            holds ``int(len(data) * test_ratio)`` rows (truncated).
        random_state: Optional seed. When given, the shuffle uses a private
            ``np.random.RandomState(random_state)``, so the split is
            reproducible and NumPy's global random state is left untouched.
            When ``None`` (the default) the global ``np.random`` state is
            used, so each call yields a different split unless the caller
            seeded it beforehand.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.
    """
    if random_state is None:
        # Original behaviour: draw from NumPy's global generator.
        shuffled_indices = np.random.permutation(len(data))
    else:
        rng = np.random.RandomState(random_state)
        shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    # iloc selects rows by integer position, which is what the permutation holds.
    return data.iloc[train_indices], data.iloc[test_indices]
# Hold out 20% of the rows as the test set and report both set sizes.
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))
16512 4128
# 每次产生一个相同的数据集
import numpy as np
# Seed NumPy's global generator so the random draws that follow come out in a
# fixed order (see https://www.cnblogs.com/subic/p/8454025.html).
np.random.seed(42)


def split_train_test(data, test_ratio, random_state=None):
    """Shuffle the row positions of *data* and cut them into train/test sets.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction of rows assigned to the test set; the size is
            ``int(len(data) * test_ratio)``.
        random_state: Optional seed for a private ``np.random.RandomState``.
            With ``None`` (default) the global ``np.random`` state is used,
            so reproducibility relies on the ``np.random.seed`` call above
            and on how many draws happened since.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.
    """
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    shuffled_indices = rng.permutation(len(data))
    test_set_size = int(len(data) * test_ratio)
    test_indices = shuffled_indices[:test_set_size]
    train_indices = shuffled_indices[test_set_size:]
    # Positional selection: the permutation contains row positions, not labels.
    return data.iloc[train_indices], data.iloc[test_indices]
# Same 20% hold-out as before, now drawn after the global seed was fixed.
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))
16512 4128
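# Both approaches above break the next time an updated dataset is fetched (they are fine only while the dataset size stays fixed).
# To keep the train/test split stable even after the data is updated, a common solution is to use each instance's identifier
# to decide whether it goes into the test set (assuming every instance has a unique, immutable identifier).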
from zlib import crc32
def test_set_check(identifier, test_ratio):
    """Decide from a hash of *identifier* whether a row belongs in the test set.

    Args:
        identifier: Numeric row identifier. It is cast with ``np.int64``
            before hashing, so a non-integer value is truncated and ids that
            share an integer part hash identically.
        test_ratio: Fraction of the 32-bit hash range assigned to the test set.

    Returns:
        ``True`` when the CRC32 of the identifier is strictly less than
        ``test_ratio * 2 ** 32``, otherwise ``False``.
    """
    # crc32 checksums the bytes of the int64; the mask keeps the value an
    # unsigned 32-bit integer.
    # See https://blog.csdn.net/u011377996/article/details/79360820
    crc = crc32(np.int64(identifier)) & 0xffffffff
    # Strictly below the test_ratio share of the hash range -> test set.
    return crc < test_ratio * 2 ** 32


# The name starts with "test_" but this is a helper, not a test case; tell
# pytest not to collect it if this module is imported from a test file.
test_set_check.__test__ = False
def split_train_test_by_id(data, test_ratio, id_column):
    """Split *data* into train/test sets based on a hash of each row's id.

    Args:
        data: DataFrame to split.
        test_ratio: Fraction passed to ``test_set_check`` for every id.
        id_column: Name of the column in *data* holding the identifiers.

    Returns:
        A ``(train_set, test_set)`` tuple: rows whose id fails the check,
        then rows whose id passes it.
    """
    ids = data[id_column]
    # Series.apply runs the check on every id
    # (see https://blog.csdn.net/stone0823/article/details/100008619).
    # astype(bool) guarantees a boolean mask, so "~" below is a logical NOT
    # even when the Series is empty and apply() returns a non-bool dtype.
    in_test_set = ids.apply(lambda id_: test_set_check(id_, test_ratio)).astype(bool)
    # loc with a boolean mask selects rows: ~mask -> training rows, mask -> test rows
    # (see https://blog.csdn.net/u014712482/article/details/85080864).
    return data.loc[~in_test_set], data.loc[in_test_set]
# reset_index() adds the existing row index as a new 'index' column, which is
# used here as the row identifier.
housing_with_id = housing.reset_index()
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
print(len(train_set), len(test_set))
16512 4128
# Build an identifier from the coordinates (longitude * 1000 + latitude),
# intended as a unique id, and split on it.
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')
print(len(train_set), len(test_set))
16322 4318
# Scikit-Learn's own helper for creating a test set: 20% hold-out with a fixed random_state.
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(len(train_set), len(test_set))
16512 4128
# Stratification: bucket median_income into five categories labelled 1-5
# using the bin edges 0, 1.5, 3.0, 4.5, 6 and +inf.
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# Plot how many rows fall into each income category.
housing["income_cat"].hist()
plt.show()
# Stratified sampling on income_cat.
from sklearn.model_selection import StratifiedShuffleSplit
# n_splits is the number of train/test pairs to generate (scikit-learn's
# default is 10); a single pair is enough here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
# split() generates the indices that divide the data into train and test sets.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    # The generated indices are row positions, so select with iloc; this is
    # identical to loc on the default RangeIndex and stays correct on any
    # other index.
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
print(len(strat_train_set), len(strat_test_set))
16512 4128
# Re-create the purely random split now that housing has the income_cat
# column: the earlier test_set was produced before that column was added, so
# test_set["income_cat"] would raise KeyError when the file runs top to bottom.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# Income-category proportions in the stratified test set.
# value_counts() returns how many times each distinct value occurs in the column.
print(strat_test_set["income_cat"].value_counts() / len(strat_test_set))
# Income-category proportions in the purely random test set.
print(test_set["income_cat"].value_counts() / len(test_set))
# Income-category proportions in the full dataset.
print(housing["income_cat"].value_counts() / len(housing))
# The stratified test set's proportions almost match the full dataset's,
# whereas the purely random test set is skewed.
3 0.350533
2 0.318798
4 0.176357
5 0.114583
1 0.039729
Name: income_cat, dtype: float64
3 0.358527
2 0.324370
4 0.167393
5 0.109496
1 0.040213
Name: income_cat, dtype: float64
3 0.350581
2 0.318847
4 0.176308
5 0.114438
1 0.039826
Name: income_cat, dtype: float64