Python · House Prices - Advanced Regression Techniques — Kaggle in Practice: Predicting House Prices (Kaggle version)

The course version is here:

https://blog.csdn.net/pxyp123/article/details/123150394

This post ports a Kaggle notebook and tweaks it slightly for local use, mainly because its plots look great.

Without further ado, the final result is shown in the figure below; it is already close to Mu Li's (李沐) result.

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# In a Jupyter notebook, missing libraries can usually be installed directly like this:
!pip install seaborn
!pip install scikit-learn  # the PyPI package is scikit-learn, not sklearn
!pip install xgboost
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import norm
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
import hashlib
import os
import tarfile
import zipfile
import requests

#@save
DATA_HUB = dict()
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """下载一个DATA_HUB中的文件,返回本地文件名"""
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'Downloading {fname} from {url}...')
    r = requests.get(url, stream=True, verify=True)
    with open(fname, 'wb') as f:
        f.write(r.content)
    return fname
def download_extract(name, folder=None):  #@save
    """下载并解压zip/tar文件"""
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        fp = zipfile.ZipFile(fname, 'r')
    elif ext in ('.tar', '.gz'):
        fp = tarfile.open(fname, 'r')
    else:
        assert False, 'Only zip/tar files can be extracted'
    fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """下载DATA_HUB中的所有文件"""
    for name in DATA_HUB:
        download(name)
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')
df_train = pd.read_csv(download('kaggle_house_train'))
test_csv = pd.read_csv(download('kaggle_house_test'))
df_train.head()

The output (the first five rows of df_train) is shown below.

# Creating dataframe for numeric and categorical features separately 
numeric_df = df_train.select_dtypes(include=['int64', 'float64'])
categorical_df = df_train.select_dtypes(include=['object'])
corrmat = numeric_df.corr()
f, ax = plt.subplots(figsize=(30, 30))
sns.heatmap(corrmat, annot=True, cmap="YlGnBu", linewidths=0.1, annot_kws={"fontsize":10});

The result is shown below (the figure is too large, so only part of it is captured):

#'SalePrice' correlation matrix (zoomed heatmap style)
k = 10 #number of variables for heatmap
cols = corrmat.nlargest(k, 'SalePrice')['SalePrice'].index
cm = np.corrcoef(numeric_df[cols].values.T)
sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cmap="YlGnBu", cbar=True, annot=True, square=True, fmt='.2f', annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.show()

The result:

#Numerical data
fig=plt.subplots(figsize=(12, 21))
i=0
for feature in numeric_df.columns:
    if feature not in ['Id', 'SalePrice']:
        i+=1
        plt.subplot(13, 3, i)
        sns.histplot(df_train[feature], kde=True, color='green')  # distplot was removed from seaborn; histplot(kde=True) is the modern equivalent
        plt.tight_layout()

Partial output:

fig=plt.subplots(figsize=(12, 21))
i=0
for feature in numeric_df.columns:
    if feature not in ['Id', 'SalePrice']:
        i+=1
        plt.subplot(13, 3, i)
        sns.scatterplot(x=df_train[feature], y=df_train['SalePrice'], color='green')  # seaborn >= 0.12 requires keyword arguments here
        plt.tight_layout()

Partial output:

def spearman(frame, features):
    spr = pd.DataFrame()
    spr['feature'] = features
    spr['spearman'] = [frame[f].corr(frame['SalePrice'], 'spearman') for f in features]
    spr = spr.sort_values('spearman')
    plt.figure(figsize=(6, 0.25*len(features)))
    sns.barplot(data=spr, y='feature', x='spearman', orient='h')

features = corrmat[['SalePrice']].sort_values(['SalePrice'], ascending=False)
features = [f for f in features.index if f != 'SalePrice']

spearman(numeric_df, features)

 

#Preprocessing data
#missing data
total = df_train.isnull().sum().sort_values(ascending=False)
percent = (df_train.isnull().sum()/df_train.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

fig, ax = plt.subplots(figsize=(10,6))
deleted_data = missing_data[missing_data['Total'] >= 1]['Total'].copy()
deleted_data.sort_values(inplace=True)
ax.set_title('Missing data')
ax = deleted_data.plot.bar(color='green')

df_train = df_train.drop((missing_data[missing_data['Percent'] >= 0.15]).index,axis = 1)
df_train = df_train.drop(df_train.loc[df_train['Electrical'].isnull()].index)
if not df_train.isnull().sum().max():
    print('Dataset has no missing values')
else:
    print(f'Dataset has {df_train.isnull().sum().max()} missing values')

Which prints:

Dataset has 81 missing values
#bivariate analysis saleprice/grlivarea
data = pd.concat([df_train['SalePrice'], df_train['GrLivArea']], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice', alpha=0.3, ylim=(0,800000), color='green', figsize=(10,6))
plt.axvline(x=4600, color='red', ls='--')

# The two points far to the right of the scatter (largest GrLivArea)
df_train.sort_values(by = 'GrLivArea', ascending = False)[:2]

# deleting points
df_train = df_train.drop(df_train[df_train['Id'] == 1299].index)
df_train = df_train.drop(df_train[df_train['Id'] == 524].index)
df_train.head()
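
Equivalently, the same two outliers can be dropped by thresholding GrLivArea; a one-line sketch of my own, assuming the 4600 sq ft cutoff marked by the red line above:

# Threshold-based removal; with the original data this drops exactly Ids 524 and 1299
df_train = df_train[df_train['GrLivArea'] < 4600]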

Splitting the training and test sets

# Convert categorical variables into dummy/indicator columns
df_train = pd.get_dummies(df_train)
test_csv = pd.get_dummies(test_csv)
# Drop Id; SalePrice is removed from x below when the columns are aligned with the test set
x = df_train.drop('Id', axis=1)
y = df_train['SalePrice']

# Keep only the columns shared by the train and test sets (an inner join on columns)
x = x.drop(x.columns.difference(test_csv.columns), axis=1)
test_csv = test_csv.drop(test_csv.columns.difference(x.columns), axis=1)
if len(x.columns) == len(test_csv.columns):
    print('Train and test datasets now have the same columns')
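
A note in passing: pd.get_dummies can produce columns that exist in one frame but not the other, which is why both drop() calls are needed. pandas' align does the same thing in one call; a minimal equivalent sketch (my addition):

# join='inner' on axis=1 keeps only the columns shared by both frames
x, test_csv = x.align(test_csv, join='inner', axis=1)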

x_train, x_test, y_train, y_test = train_test_split(x.to_numpy(), y.to_numpy(), test_size=0.25, random_state=42)

imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# Imputation transformer for completing missing values. (nan to mean)
x_train = imp_mean.fit_transform(x_train)
x_test = imp_mean.transform(x_test)  # transform only: fit the imputer on the training split to avoid leaking test statistics

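To keep the imputer and the model fitted together on exactly the same data, the two steps can also be chained; a minimal sketch with sklearn's Pipeline (my addition, reusing the split from above):

from sklearn.pipeline import make_pipeline

# The pipeline re-fits the imputer on whatever .fit() receives,
# so train/test evaluation and cross-validation stay leak-free.
pipe = make_pipeline(SimpleImputer(strategy='mean'), Ridge(alpha=1.0))
pipe.fit(x_train, y_train)
print(pipe.score(x_test, y_test))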

Selecting features for the model:

from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestRegressor

select = SelectFromModel(
    RandomForestRegressor(n_estimators=100), threshold = "median")

select.fit(x_train,y_train)

x_train_l1 = select.transform(x_train)

print("форма обуч набора X: {}".format(x_train.shape))
print("форма обуч набора X c l1: {}".format(x_train_l1.shape))


def DistribPlot(y_train, y_predicted):    
    
    fig = plt.figure(figsize=(10, 6))

    # distplot(hist=False) was a pure KDE; kdeplot is the modern equivalent
    ax1 = sns.kdeplot(y_train, color="r", label='Actual Values')
    ax2 = sns.kdeplot(y_predicted, color="b", label='Predicted Values', ax=ax1)

    plt.title('Actual Values and Predicted Values')
    plt.xlabel('Price (in dollars)')
    plt.ylabel('Density')  # the y-axis of a KDE is a density, not overall quality
    plt.gca().legend(('Actual Values','Predicted Values'))

    plt.show()


def scoreboard(model, y_predicted):
    # model.score returns R^2 (the coefficient of determination), not a square root
    print(f'Train-set R^2 of the {type(model).__name__} model is {model.score(x_train, y_train)}')
    print(f'Test-set R^2 of the {type(model).__name__} model is {model.score(x_test, y_test)}')
    DistribPlot(y_train, y_predicted)
    
#Ridge
alpha = [0.1, 0.5, 1, 10]
models = {}
errors = {}
count = 0
for a in alpha:    
    ridge = Ridge(alpha=a).fit(x_train, y_train)
    predicted_data = ridge.predict(x_test)
    errors[count] = mean_absolute_error(y_test, predicted_data)
    models[count] = ridge
    count +=1
    print(f'Mean absolute error of the Ridge model (alpha={a}) is {mean_absolute_error(y_test, predicted_data)}')
min_error = [k for k,v in errors.items() if v == min(errors.values())]
print()
ridge = models[min_error[0]]
predicted_data = ridge.predict(x_test)  # re-predict with the best model, not the last one fitted
scoreboard(ridge, predicted_data)
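
For reference, sklearn can also do this alpha search with built-in cross-validation; a minimal RidgeCV sketch (my addition):

from sklearn.linear_model import RidgeCV

# RidgeCV selects alpha by (generalized) cross-validation on the training split
ridge_cv = RidgeCV(alphas=[0.1, 0.5, 1, 10]).fit(x_train, y_train)
print(ridge_cv.alpha_, ridge_cv.score(x_test, y_test))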

#Lasso
alpha = [0.0001, 0.001, 0.01, 0.1, 1]
models = {}
errors = {}
count = 0
for a in alpha:    
    lasso = Lasso(alpha=a, max_iter=10000).fit(x_train, y_train)
    predicted_data = lasso.predict(x_test)
    errors[count] = mean_absolute_error(y_test, predicted_data)
    models[count] = lasso
    count +=1
    print(f'Mean absolute error of the Lasso model (alpha={a}) is {mean_absolute_error(y_test, predicted_data)}')
min_error = [k for k,v in errors.items() if v == min(errors.values())]
print()
lasso = models[min_error[0]]
predicted_data = lasso.predict(x_test)  # re-predict with the best model
scoreboard(lasso, predicted_data)

#Decision Trees
max_depth = [6, 7, 8, 10, 11, 12, 13]
models = {}
errors = {}
count = 0
for a in max_depth:
    decision_tree = DecisionTreeRegressor(max_depth=a).fit(x_train, y_train)
    predicted_data = decision_tree.predict(x_test)
    errors[count] = mean_absolute_error(y_test, predicted_data)
    models[count] = decision_tree
    count +=1
    print(f'Mean absolute error of the decision tree model (max_depth={a}) is {mean_absolute_error(y_test, predicted_data)}')
min_error = [k for k,v in errors.items() if v == min(errors.values())]
print()
decision_tree = models[min_error[0]]
predicted_data = decision_tree.predict(x_test)  # re-predict with the best model
scoreboard(decision_tree, predicted_data)

#Random Forest
n_estimators = [100, 200, 300, 400, 500, 1000]
models = {}
errors = {}
count = 0
for a in n_estimators:
    random_forest = RandomForestRegressor(n_estimators=a, max_depth=10).fit(x_train, y_train)
    predicted_data = random_forest.predict(x_test)
    errors[count] = mean_absolute_error(y_test, predicted_data)
    models[count] = random_forest
    count +=1
    print(f'Mean absolute error of the random forest model (n_estimators={a}) is {mean_absolute_error(y_test, predicted_data)}')
min_error = [k for k,v in errors.items() if v == min(errors.values())]
print()
random_forest = models[min_error[0]]
predicted_data = random_forest.predict(x_test)  # re-predict with the best model
scoreboard(random_forest, predicted_data)

#XGBOOST
n_estimators = [100, 200, 300, 400, 500, 1000]
max_depth = [3, 4, 5, 6, 7, 8, 10]
models = {}
errors = {}
count = 0
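# zip pairs the two lists element-wise and stops at the shorter one (6 pairs here); it is not a full grid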
for a, b in zip(n_estimators, max_depth):
    xgboost = XGBRegressor(n_estimators=a, max_depth=b, eta=0.1, subsample=0.7, colsample_bytree=0.8).fit(x_train, y_train)
    predicted_data = xgboost.predict(x_test)
    errors[count] = mean_absolute_error(y_test, predicted_data)
    models[count] = xgboost
    count +=1
    print(f'Mean absolute error of the XGBoost model (n_estimators={a}, max_depth={b}) is {mean_absolute_error(y_test, predicted_data)}')
min_error = [k for k,v in errors.items() if v == min(errors.values())]
print()
xgboost = models[min_error[0]]
predicted_data = xgboost.predict(x_test)  # re-predict with the best model
scoreboard(xgboost, predicted_data)
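
If a true grid over every n_estimators × max_depth combination is what you want, sklearn's GridSearchCV handles the cross-product with cross-validation; a minimal sketch (my addition, with a smaller grid to keep the runtime sane):

from sklearn.model_selection import GridSearchCV

param_grid = {'n_estimators': [100, 300, 500], 'max_depth': [3, 5, 7]}
search = GridSearchCV(XGBRegressor(eta=0.1, subsample=0.7, colsample_bytree=0.8),
                      param_grid, scoring='neg_mean_absolute_error', cv=3)
search.fit(x_train, y_train)
print(search.best_params_, -search.best_score_)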

Feature importance:

def plot_feature_importance(importance,names,model_type):

    #Create arrays from feature importance and feature names
    feature_importance = np.array(importance)
    feature_names = np.array(names)

    #Create a DataFrame using a Dictionary
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)

    #Sort the DataFrame in order decreasing feature importance
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    #Define size of bar plot
    plt.figure(figsize=(15,8))
    sns.set(font_scale=2)
    #Plot Seaborn bar chart
    sns.barplot(x=fi_df['feature_importance'][:10], y=fi_df['feature_names'][:10])
    #Add chart labels    
    plt.title(model_type + ' FEATURE IMPORTANCE')
    plt.xlabel('FEATURE IMPORTANCE')
    plt.ylabel('FEATURE NAMES')

plot_feature_importance(xgboost.feature_importances_,x.columns,'XGBRegressor')
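
xgboost also ships its own helper for this; a short alternative sketch (my addition):

from xgboost import plot_importance

# Built-in importance plot for the trained booster (top 10 features)
plot_importance(xgboost, max_num_features=10)
plt.show()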

sub_csv = pd.read_csv('sample_submission.csv')  # read the sample submission file
sub_csv.head()

# Impute the remaining missing values in the test set with column means
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
test_csv = imp_mean.fit_transform(test_csv)


models = [ridge, lasso, decision_tree, random_forest, xgboost]
for model in models:
    RMS = mean_squared_error(np.log(sub_csv['SalePrice']), np.log(model.predict(test_csv)), squared=False)
    print(f'Root-Mean-Squared-Error of {type(model).__name__} is {RMS}')
    
    data = pd.concat([sub_csv['Id'], pd.Series(model.predict(test_csv))], axis=1)
    data.rename(columns={"Id": "Id", 0: "SalePrice"}, inplace=True)
    data.to_csv(f'{type(model).__name__}_submission.csv', index=False)
#     data.info()
    print(f'Submission of {type(model).__name__} was successfully saved!')
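
One caveat: the RMSE above is computed against sample_submission.csv, whose SalePrice column is only a placeholder, so treat it as a pipeline sanity check rather than a real score. The held-out split gives a more meaningful number for the competition's log-RMSE metric; a sketch (my addition):

# Log-RMSE on the held-out validation split (the competition metric);
# on sklearn >= 1.6 use root_mean_squared_error instead of squared=False
for model in models:
    rmse = mean_squared_error(np.log(y_test), np.log(model.predict(x_test)), squared=False)
    print(f'Validation log-RMSE of {type(model).__name__} is {rmse:.4f}')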

One submission file per model, e.g. XGBRegressor_submission.csv, will appear in the same directory as the Python script.

Then just submit that file to Kaggle.

"House Prices - Advanced Regression Techniques"是一个用于预测房屋价格的数据集。该数据集包含了79个房屋相关的征,包括地理位置、建筑材料、房间数量、房屋面积等。这些征作为输入,我们需要通过回归分析预测房屋的售价。 对于这个数据集,我们可以使用各种先进的回归技术来进行分析预测。以下是几种常见的回归技术: 1. 线回归:线回归是一种基本的回归方法,它试图通过征和目标变量之间的线关系来建立预测模型。线回归的优点是简单且计算效率高。 2. 多项式回归:多项式回归是在线回归的基础上引入了高次项的回归方法。它可以更准确地拟合非线关系,并提高回归模型的灵活。 3. 支持向量回归(SVR):SVR是一种利用支持向量机技术进行回归的方法。它可以处理非线问题,并且对于异常值有较好的鲁棒。 4. 决策树回归:决策树回归是一种以树状结构来建立回归模型的方法。它可以自动选择征,并且对于非线关系有较好的拟合能力。 5. 随机森林回归:随机森林是一个基于多个决策树的集成算法。它可以通过组合多个决策树来提高回归模型的准确和稳定。 在使用这些回归技术进行预测时,我们通常需要进行征工程来处理缺失值、异常值和离散征等。此外,还需要进行模型评估和选择最优模型的步骤,如交叉验证和网格搜索。 总之,"House Prices - Advanced Regression Techniques"数据集是一个用于预测房屋价格的数据集,我们可以利用先进的回归技术来分析预测房屋价格。通过合理选择合适的回归模型和进行适当的征工程,我们可以提高预测准确并为房地产市场提供有价值的信息。