机器学习实战笔记【2-3】

最新推荐文章于 2024-03-10 14:55:51 发布

Shine.Zhang

最新推荐文章于 2024-03-10 14:55:51 发布

阅读量189

点赞数 1

分类专栏：日常记录 & 笔记

本文链接：https://blog.csdn.net/qq_32939413/article/details/117047602

版权

日常记录 & 笔记专栏收录该内容

35 篇文章 2 订阅

订阅专栏

Gitee:https://gitee.com/Shine_Zhang/machine_learning_practices

参考：

https://zhuanlan.zhihu.com/p/88797654
《机器学习实战：基于Scikit-Learn、Keras和TensorFlow第2版》

加载数据，读取CSV文件

import pandas as pd
import os

HOUSING_PATH = 'datasets/housing'


def load_housing_data(housing_path=HOUSING_PATH):
    """Load ``housing.csv`` from a directory into a pandas DataFrame.

    Args:
        housing_path: Directory that contains ``housing.csv``; defaults to
            ``HOUSING_PATH``.

    Returns:
        The DataFrame produced by ``pd.read_csv`` for that file.
    """
    return pd.read_csv(os.path.join(housing_path, 'housing.csv'))



# Load the housing CSV into a DataFrame and print its first rows
housing = load_housing_data(HOUSING_PATH)
print(housing.head())

   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY

# Print each column's dtype and non-null count (reveals missing values)
housing.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
print(housing.describe())

          longitude      latitude  housing_median_age   total_rooms  \
count  20640.000000  20640.000000        20640.000000  20640.000000   
mean    -119.569704     35.631861           28.639486   2635.763081   
std        2.003532      2.135952           12.585558   2181.615252   
min     -124.350000     32.540000            1.000000      2.000000   
25%     -121.800000     33.930000           18.000000   1447.750000   
50%     -118.490000     34.260000           29.000000   2127.000000   
75%     -118.010000     37.710000           37.000000   3148.000000   
max     -114.310000     41.950000           52.000000  39320.000000   

       total_bedrooms    population    households  median_income  \
count    20433.000000  20640.000000  20640.000000   20640.000000   
mean       537.870553   1425.476744    499.539680       3.870671   
std        421.385070   1132.462122    382.329753       1.899822   
min          1.000000      3.000000      1.000000       0.499900   
25%        296.000000    787.000000    280.000000       2.563400   
50%        435.000000   1166.000000    409.000000       3.534800   
75%        647.000000   1725.000000    605.000000       4.743250   
max       6445.000000  35682.000000   6082.000000      15.000100   

       median_house_value  
count        20640.000000  
mean        206855.816909  
std         115395.615874  
min          14999.000000  
25%         119600.000000  
50%         179700.000000  
75%         264725.000000  
max         500001.000000

%matplotlib inline
import matplotlib.pyplot as plt
# Draw one 50-bin histogram per numeric column on a 20x15 figure
housing.hist(bins=50, figsize=(20, 15))
plt.show()

在这里插入图片描述

创建训练集/测试集

# 并不完美，每次产生一个不同的数据集
import numpy as np
def split_train_test(data, test_ratio):
    """Randomly partition ``data`` into a training part and a test part.

    Row positions are shuffled with NumPy's global random generator, so the
    result differs between calls unless that generator is seeded beforehand.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame).
        test_ratio: Fraction of rows assigned to the test part; the test size
            is ``int(len(data) * test_ratio)``.

    Returns:
        A ``(train, test)`` tuple of row selections from ``data``.
    """
    row_count = len(data)
    order = np.random.permutation(row_count)  # shuffled row positions
    n_test = int(row_count * test_ratio)
    # The first n_test shuffled positions form the test part, the rest train
    test_rows, train_rows = order[:n_test], order[n_test:]
    return data.iloc[train_rows], data.iloc[test_rows]


# Hold out 20% of the rows as the test set and print both sizes
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))

16512 4128

# Produce the same split on every run
import numpy as np
np.random.seed(42) # seed NumPy's global RNG so later random draws follow a fixed sequence; see https://www.cnblogs.com/subic/p/8454025.html
def split_train_test(data, test_ratio):
    """Shuffle the row positions of ``data`` and cut them into train/test.

    The shuffle draws from NumPy's global random state, so seeding that state
    before the call makes the split repeatable.

    Args:
        data: Object supporting ``len()`` and ``.iloc`` (e.g. a DataFrame).
        test_ratio: Share of rows for the test part; truncated to a whole
            number of rows with ``int()``.

    Returns:
        ``(train, test)``: the rows outside and inside the test part.
    """
    shuffled = np.random.permutation(len(data))
    cut = int(len(data) * test_ratio)  # number of rows in the test part
    held_out = shuffled[:cut]
    kept = shuffled[cut:]
    # .iloc selects rows by integer position
    return data.iloc[kept], data.iloc[held_out]


# Hold out 20% of the rows as the test set and print both sizes
train_set, test_set = split_train_test(housing, 0.2)
print(len(train_set), len(test_set))

16512 4128

# 以上两种方案在数据集更新（新增数据）后都会失效；如果数据集固定不变，则没有问题。为了在数据集更新之后仍能得到稳定的训练集/测试集划分，常见的解决方案
#  是用每个实例的标识符来决定它是否进入测试集（假定每个实例都有一个唯一且不变的标识符）。
from zlib import crc32
def test_set_check(identifier, test_ratio):
    crc = crc32(np.int64(identifier)) & 0xffffffff # crc32 返回循环冗余校验值 :https://blog.csdn.net/u011377996/article/details/79360820
    return crc < test_ratio * 2 ** 32 # 哈希值小于或等于最大哈希值的20%(test_ratio=0.2)


def split_train_test_by_id(data, test_ratio, id_column):
    """Split ``data`` into (train, test) based on each row's identifier.

    A row goes to the test part when ``test_set_check`` returns true for its
    value in ``id_column``; every other row goes to the training part.

    Args:
        data: DataFrame holding the rows to split.
        test_ratio: Ratio forwarded to ``test_set_check``.
        id_column: Name of the column that holds the identifiers.

    Returns:
        A ``(train, test)`` tuple of row selections from ``data``.
    """
    def goes_to_test(row_id):
        return test_set_check(row_id, test_ratio)

    # Element-wise check gives a boolean Series aligned with data's index
    test_mask = data[id_column].apply(goes_to_test)
    # Boolean-mask selection with .loc; the negated mask is the training part
    train_part = data.loc[~test_mask]
    test_part = data.loc[test_mask]
    return train_part, test_part

housing_with_id = housing.reset_index() # reset_index() turns the old index into a new leading 'index' column
# Use that 'index' column as the identifier for an ID-based 20% split
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'index')
print(len(train_set), len(test_set))

16512 4128

# Build an identifier from longitude and latitude and split on it
housing_with_id['id'] = housing['longitude'] * 1000 + housing['latitude']
train_set, test_set = split_train_test_by_id(housing_with_id, 0.2, 'id')
print(len(train_set), len(test_set))

16322 4318

# Scikit-Learn's own helper for creating a test set
from sklearn.model_selection import train_test_split
# random_state=42 fixes the shuffle so the same 20% test split is produced each run
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
print(len(train_set), len(test_set))

16512 4128

# Stratification: bucket median_income into five income categories (labels 1-5)
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])
# Plot how many rows fall into each income category
housing["income_cat"].hist()
plt.show()
# Stratified sampling
from sklearn.model_selection import StratifiedShuffleSplit
# n_splits is the number of train/test pairs to generate (the scikit-learn
# default is 10); a single split is enough here.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2,
                               random_state=42)

# split() yields positional row indices, so select with .iloc; label-based
# .loc would only pick the same rows while the index is the default 0..n-1.
for train_index, test_index in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.iloc[train_index]
    strat_test_set = housing.iloc[test_index]
    print(len(strat_train_set), len(strat_test_set))

在这里插入图片描述

16512 4128

# The earlier train_test_split call ran before "income_cat" was added to
# `housing`, so that test_set has no such column. Redo the same random split
# (same test_size and random_state) so the comparison below can run.
train_set, test_set = train_test_split(housing, test_size=0.2, random_state=42)
# Income-category proportions in the stratified test set
# (value_counts() counts how many rows hold each distinct value)
print(strat_test_set["income_cat"].value_counts()/len(strat_test_set))
# Income-category proportions in the purely random test set
print(test_set["income_cat"].value_counts()/len(test_set))
# Income-category proportions in the full dataset
print(housing["income_cat"].value_counts()/len(housing))
# The stratified test set matches the full dataset's proportions almost
# exactly, while the purely random test set is skewed.

3    0.350533
2    0.318798
4    0.176357
5    0.114583
1    0.039729
Name: income_cat, dtype: float64
3    0.358527
2    0.324370
4    0.167393
5    0.109496
1    0.040213
Name: income_cat, dtype: float64
3    0.350581
2    0.318847
4    0.176308
5    0.114438
1    0.039826
Name: income_cat, dtype: float64

Shine.Zhang

关注

1
点赞
踩
1

收藏

觉得还不错? 一键收藏
打赏
0
评论
机器学习实战笔记【2-3】

加载数据，读取CSV文件import pandas as pdimport osHOUSING_PATH = 'datasets/housing'def load_housing_data(housing_path=HOUSING_PATH): csv_path = os.path.join(housing_path, 'housing.csv') return pd.read_csv(csv_path)housing = load_housing_data(HOUSI
复制链接

扫一扫