李沐kaggle房价预测-h2o代码注释

钢铁小狗侠
已于 2023-10-02 20:36:04 修改
阅读量216
点赞数
文章标签：机器学习人工智能
于 2023-10-01 22:21:36 首次发布
本文链接：https://blog.csdn.net/m0_63086198/article/details/133468480
版权
代码原网址：AutoML(Using h2o) | Kaggle
对数据集中NaN的填充方法：pandas处理NaN值的方法-CSDN博客
import numpy as np
import pandas as pd
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import pandas as pd
from pandas_profiling import ProfileReport
import seaborn as sns
from sklearn import metrics
from scipy import stats

from copy import deepcopy

import h2o



# Load the training and test sets with pandas; the later preprocessing steps use pandas APIs on them.
train_data = pd.read_csv('D:/深度学习/房价预测/california-house-prices/train.csv')
test_data = pd.read_csv('D:/深度学习/房价预测/california-house-prices/test.csv')
train_data.shape, test_data.shape

# Remove seven training rows by index label (DataFrame.drop works on a pandas DataFrame).
# NOTE(review): presumably hand-picked outliers - the selection criterion is not shown here.
train_data = train_data.drop([3674,6055,32867,34876,43398,44091,44633])

# Scatter-plot the sale price against each price-related feature, one figure per feature:
# listed price, tax assessed value, annual tax amount and last sold price.
for feature_name in ('Listed Price', 'Tax assessed value', 'Annual tax amount', 'Last Sold Price'):
    # Place the target and the feature side by side (column-wise concatenation).
    data = pd.concat([train_data['Sold Price'], train_data[feature_name]], axis=1)
    # x-axis: the feature, y-axis: the sale price.
    fig = px.scatter(data, x=feature_name, y='Sold Price')
    fig.show()

# y is the label. Rows were dropped earlier, so the index is reset (drop=True discards the
# old index) to obtain a gap-free index again.
y = train_data['Sold Price'].reset_index(drop=True)    
# Training features are the training data without the sale-price column.
train_features = train_data.drop('Sold Price', axis=1)   
test_features = test_data.copy()
# Stack train and test into a single frame so both receive the same preprocessing.
features = pd.concat([train_features, test_features]).reset_index(drop=True)
features.shape

# Normalise one raw 'Bedrooms' cell: a comma-separated room description is replaced
# by the number of rooms it lists.
def proc_bedroom(x):
    """Convert a raw ``Bedrooms`` entry into a room count where possible.

    Args:
        x: A single cell value. May be a missing value (NaN/None), a number,
            a digit-only string such as ``"3"``, or a comma-separated
            description such as ``"2,3,Walk-in Closet,4"``.

    Returns:
        ``x`` unchanged when it is not a string or is a digit-only string;
        otherwise the number of comma-separated entries, not counting a
        ``'Walk-in Closet'`` entry (a walk-in closet is not a bedroom).
    """
    # Missing values and real numbers have no .isdigit(); pass them through
    # together with digit-only strings, which are converted to numbers later.
    if not isinstance(x, str) or x.isdigit():
        return x

    # Strip each entry so "A, Walk-in Closet" (space after the comma) is
    # recognised the same way as "A,Walk-in Closet".
    rooms = [entry.strip() for entry in x.split(',')]
    n = len(rooms)
    if 'Walk-in Closet' in rooms:
        n -= 1
    return n

# Normalise every 'Bedrooms' entry with proc_bedroom, then coerce the column to a numeric dtype.
features['Bedrooms'] = pd.to_numeric(features['Bedrooms'].apply(proc_bedroom))
# Zip is a postal code: store it as text so it is treated as a discrete category
# rather than a continuous quantity.
features['Zip'] = features['Zip'].astype(str)

# Summarise the missing values per column.
# null_mask has the same shape as features and flags every NaN cell with True.
null_mask = features.isnull()
null_counts = null_mask.sum()
# Number of missing values per column, largest first (ascending=True would give smallest first).
total = null_counts.sort_values(ascending=False)
# Share of missing values in percent; count() on the mask is the number of cells per column
# (missing and non-missing alike).
percent = (null_counts / null_mask.count() * 100).sort_values(ascending=False)
# Join both series side by side as 'Total' and 'Percent', then turn the index
# (the feature names) into a regular column.
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).reset_index()
# Rename the former index column from 'index' to 'Name'.
missing_data.columns = ['Name', 'Total', 'Percent']
# Show the first ten rows.
missing_data[:10]

# Fill the missing values of a DataFrame, using one strategy per group of columns.
def handle_missing(features):
    """Fill NaNs column by column and return the same DataFrame.

    The frame passed in is modified in place: each listed column is replaced
    by its filled version. Columns not listed below are left untouched.

    Args:
        features: DataFrame containing every column named below.

    Returns:
        The ``features`` object that was passed in.
    """
    # Columns whose missing entries become 0.
    zero_columns = (
        'Last Sold Price', 'Lot', 'Full bathrooms', 'Annual tax amount',
        'Tax assessed value', 'Bathrooms', 'Bedrooms',
        'Total interior livable area', 'Total spaces', 'Garage spaces',
    )
    # Columns whose missing entries become the string 'None'.
    text_columns = (
        'Last Sold On', 'Middle School', 'Appliances included', 'Flooring',
        'Laundry features', 'Cooling features', 'Cooling', 'Heating features',
        'Heating', 'Elementary School', 'High School', 'Parking features',
        'Parking', 'Summary',
    )
    # Columns whose missing entries become the column maximum.
    max_columns = (
        'Middle School Score', 'Middle School Distance',
        'Elementary School Score', 'Elementary School Distance',
        'High School Score', 'High School Distance',
    )
    # Columns whose missing entries become the most frequent value.
    mode_columns = ('Year built', 'Region')

    # Each entry pairs a column group with a rule that derives the fill value
    # from the column itself.
    fill_plan = (
        (zero_columns, lambda column: 0),
        (max_columns, lambda column: column.max()),
        (text_columns, lambda column: 'None'),
        (mode_columns, lambda column: column.mode()[0]),
    )
    for column_names, pick_fill_value in fill_plan:
        for name in column_names:
            column = features[name]
            features[name] = column.fillna(pick_fill_value(column))

    return features

features = handle_missing(features)
features.shape

# Measure the skewness of the numeric columns.

# Names of the columns whose dtype is int64 or float64.
numerical_columns = features.select_dtypes(include=['int64','float64']).columns
# One skewness value per column, sorted in descending order.
skewed_features = features[numerical_columns].apply(stats.skew).sort_values(ascending=False)
# Single-column DataFrame named 'Skew value' holding the sorted skewness values.
skewness = skewed_features.to_frame(name='Skew value')
skewness.head(20)

'''
通过 Box-Cox 变换来减小数值特征的偏斜度,该做法可以使数据更加对称且接近正态分布。
对于一些统计方法和机器学习模型的性能和准确性有显著影响，因为许多模型假定数据是正态分布的。
'''
def fix_skew(features):
    """Reduce the skew of strongly skewed numeric columns with a Box-Cox transform.

    Every int64/float64 column whose absolute skewness exceeds 0.5 is
    overwritten in place with ``boxcox1p(column, lambda)``, where lambda is
    fitted by ``boxcox_normmax`` on ``column + 1``.

    Args:
        features: DataFrame to transform; it is modified in place.

    Returns:
        The ``features`` object that was passed in.
    """
    # Function-local imports, as in the rest of this script.
    from scipy.stats import boxcox_normmax
    from scipy.special import boxcox1p

    # Only int64/float64 columns take part in the skewness check.
    numeric_part = features.select_dtypes(include=['int64', 'float64'])
    skew_by_column = numeric_part.apply(stats.skew).sort_values(ascending=False)

    # Keep the columns skewed by more than 0.5 in either direction.
    strongly_skewed = skew_by_column[skew_by_column.abs() > 0.5].index

    for name in strongly_skewed:
        values = features[name]
        # Fit lambda on values + 1, the same shift boxcox1p applies to its input.
        best_lambda = boxcox_normmax(values + 1)
        features[name] = boxcox1p(values, best_lambda)

    return features

# Helper for the 'Garage spaces' and 'Total spaces' columns: negative values are
# turned into zero so that the Box-Cox transform can be applied afterwards.
def reset_zero(x):
    """Return ``x`` unless it is below zero, in which case return 0."""
    return 0 if 0 > x else x
    
# Clamp negative parking-space counts to zero, then reduce the skew of the numeric columns.
for spaces_column in ('Garage spaces', 'Total spaces'):
    features[spaces_column] = features[spaces_column].apply(reset_zero)
features = fix_skew(features)
features.head()

# Split the combined frame back into the training part x (the first len(y) rows)
# and the test part x_test (the remaining rows).
x = features.iloc[:len(y), :]
x_test = features.iloc[len(y):, :]
x.shape, y.shape, x_test.shape

# Keep only a hand-written subset of the features, chosen by variable importance.
# Per the original author, the importances can be obtained with h2o's varimp()
# (based on a random-forest model); that step is not part of this script.
selected=['Listed Price','Tax assessed value','Annual tax amount','Listed On','Elementary School Distance','Last Sold On',
'Zip','Total interior livable area','Last Sold Price','Lot','Year built','Bathrooms','High School Distance',
'Elementary School Score','Full bathrooms','Middle School Distance','Heating features','Bedrooms',
'Elementary School','Laundry features','Region','Middle School Score','Type',
'Total spaces','High School Score','Parking']

# The test frame additionally keeps the 'Id' column.
x=x[selected]
x_test=x_test[selected+['Id']]
x.head()

import h2o

# Start the H2O cluster. A Java runtime (JRE) must be installed on the machine,
# otherwise this call fails.
h2o.init()
# Combine the training features with the label and convert the result to an H2O frame.
hf = h2o.H2OFrame(pd.concat([x, y], axis=1))
# Convert the test set x_test to an H2O frame.
x_test_hf = h2o.H2OFrame(x_test)

# Predictor columns: every column of the training frame except the target.
predictors = hf.drop('Sold Price').columns
# Response (target) column.
response = 'Sold Price'

from h2o.automl import H2OAutoML

# stopping_metric: Specify the metric to use for early stopping. 
aml = H2OAutoML(
    max_models=50,     # build at most 50 models for this task
    include_algos=["XGBoost"],     # restrict AutoML to the XGBoost algorithm
    max_runtime_secs=7200,         # maximum run time in seconds
    stopping_metric='RMSLE',       # early-stopping metric: root mean squared logarithmic error
    sort_metric='RMSLE'            # rank the models on the leaderboard by RMSLE
)

# Train the model
aml.train(x=predictors,y=response,training_frame=hf)

# Keep the AutoML leaderboard (the ranking of the trained models) in lb.
lb = aml.leaderboard; 
lb

# The model ranked first on the leaderboard.
aml.leader
# Per the original author, using only this model overfits noticeably; averaging the
# outputs of the top k models is considered instead.

# Load the sample submission file that serves as the template for the final submission.
# The path sits in the same directory as train.csv/test.csv; a stray leading quote
# inside the original literal made the path invalid.
submission_results = pd.read_csv('D:/深度学习/房价预测/california-house-prices/sample_submission.csv')
# Average the test-set predictions of the k best models on an AutoML leaderboard.
def top_k_avg_predict(k, leaderboard, log_target=True):
    """Predict the test set with the top ``k`` leaderboard models and average the results.

    Reads the module-level ``x_test_hf`` (H2O test frame) and
    ``submission_results`` (sample submission, used only for its index and
    the name of its second column); neither is modified.

    Args:
        k: Number of leading models to average. If the leaderboard holds
            fewer than ``k`` models, all available models are averaged.
        leaderboard: H2O AutoML leaderboard; its ``model_id`` column is read
            in leaderboard order.
        log_target: When True (the default, matching the previous behaviour)
            each prediction is passed through ``np.expm1`` to undo a
            ``log1p`` transform of the target before averaging. Pass False
            when the models were trained on the untransformed target.
            NOTE(review): the script in this file trains on the raw
            'Sold Price', so its call probably needs ``log_target=False`` -
            confirm against the training code.

    Returns:
        pandas Series with the averaged predictions, indexed like
        ``submission_results`` and named after its second column.

    Raises:
        ValueError: If ``k`` is smaller than 1 or the leaderboard is empty.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    # Convert the H2O leaderboard to pandas and take the ids of the first k models.
    model_ids = leaderboard.as_data_frame()['model_id'].head(k)
    if model_ids.empty:
        raise ValueError("leaderboard contains no models")
    n_models = len(model_ids)

    # Accumulate from zero in a fresh Series: starting from the sample column
    # would add its placeholder values to the average and write through to
    # submission_results.
    ans = pd.Series(0.0, index=submission_results.index,
                    name=submission_results.columns[1])
    for model_id in model_ids:
        # Predict the test set with this model and move the result to pandas.
        pred = h2o.get_model(model_id).predict(x_test_hf).as_data_frame()['predict']
        if log_target:
            pred = np.expm1(pred)
        # Each model contributes an equal share to the mean.
        ans += pred / n_models
    return ans

# Average the predictions of the top 8 models and store them in the second column.
# NOTE(review): top_k_avg_predict applies np.expm1 to the predictions, while y is the
# raw 'Sold Price' in this script - confirm the target scale before submitting.
submission_results.iloc[:, 1]=top_k_avg_predict(8,aml.leaderboard)
# Save the submission as submission.csv; index=False keeps the index column out of the file.
submission_results.to_csv('submission.csv', index=False)