房价预测(机器学习)

数据来源:Kaggle。这些标注是为了帮助自己学习整个建模流程。

一、数据查看

#了解数据
#先把数据调出来看看长什么样。tips:注意几行几列(也就是维数),注意数据类型(数值、类别等),以及数据蕴含的信息


import numpy as np # linear algebra
import pandas as pd # data processing

# Load the Kaggle house-price data; CSV column 0 ("Id") becomes the row index.
id_column = 0
train_df = pd.read_csv('train.csv', index_col=id_column)
test_df = pd.read_csv('test.csv', index_col=id_column)

# Peek at the first rows: check dimensions, dtypes, and what each column holds.
train_df.head()

二、目标变量平滑化与数据合并

%matplotlib inline
prices=pd.DataFrame({'price':train_df['SalePrice'],'log(price+1)':np.log1p(train_df["SalePrice"])})
prices.hist()
#上面是检验平滑化
y_train=np.log1p(train_df.pop('SalePrice'))
y_train.head()
#这里是发现可以平滑正态了于是开始操作一番y
all_df = pd.concat(objs=[train_df, test_df], axis=0)
#合并训练集与测试集

三、类别特征 one-hot 编码

# MSSubClass encodes the dwelling type as an integer code, so pandas treats
# it as numeric — inspect its dtype and the distribution of codes.
all_df['MSSubClass'].dtypes
all_df['MSSubClass'].value_counts()
# BUG FIX: pd.get_dummies(all_df) only encodes object-dtype columns, so the
# numeric MSSubClass would silently be left as-is. Cast it to str first so
# it is one-hot encoded like the other categorical features.
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
# Preview the one-hot encoding for this single column.
pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()
# One-hot encode every categorical column of the merged frame in one shot.
all_dummy_df = pd.get_dummies(all_df)
all_dummy_df.head()

四、缺失值处理

# Missing-value handling: inspect the worst columns, then impute with means.
missing_counts = all_dummy_df.isnull().sum().sort_values(ascending=False)
missing_counts.head(10)
column_means = all_dummy_df.mean()
column_means.head(10)
all_dummy_df = all_dummy_df.fillna(column_means)
# Verify the imputation: per-row check on one column, then a global NaN count.
all_dummy_df['LotFrontage'].isnull()
all_dummy_df.isnull().sum().sum()

五、数值特征筛选

# Boolean mask over the dtypes picks out the non-categorical (numeric)
# columns of the merged frame.
is_not_object = all_df.dtypes != 'object'
numeric_cols = all_df.columns[is_not_object]
numeric_cols

六、Ridge 回归建模

#将数据集all分开回训练集和测试集
#建模原理:从训练集提取特征并放到测试集中预测
dummy_train_df=all_dummy_df.loc[train_df.index]
dummy_test_df=all_dummy_df.loc[test_df.index]
dummy_train_df.shape#数据检查
#ridge regression模型
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
#将数据从dataframe转化为numpy下的array数组
X_train = dummy_train_df.values
X_test = dummy_test_df.values
from sklearn.linear_model import RidgeCV

alphas = np.logspace(-3, 2, 50)
clf = RidgeCV(alphas=alphas, store_cv_values=True)
clf.fit(X_train, y_train)
test_score = np.sqrt(np.mean(clf.cv_values_, axis=0))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error")

七、随机森林建模

# Random-forest model: sweep max_features and record the mean 5-fold CV RMSE.
from sklearn.ensemble import RandomForestRegressor

max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for frac in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=frac)
    # Negate the sklearn "neg" score back to MSE, then take the root.
    fold_rmse = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5,
                                         scoring='neg_mean_squared_error'))
    test_scores.append(fold_rmse.mean())
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");

八、模型融合与生成提交结果

# Final models with the hyper-parameters chosen from the CV curves above.
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
# BUG FIX: ridge.fit was commented out, so ridge.predict below raised
# NotFittedError — both models must be trained before predicting.
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
# expm1 undoes the log1p transform applied to the target earlier,
# mapping predictions back to the original price scale.
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
# Simplest possible ensemble: average the two models' predictions.
y_final = (y_ridge + y_rf) / 2
# Build the Kaggle submission frame and preview it.
submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_final})
submission_df.head(10)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值