房价预测(机器学习)

数据来源:Kaggle。这些标注是为了帮助自己学习整个建模流程。

一、数据查看

#了解数据
#先把数据调出来看看长什么样。tips:注意几行几列(也就是维数),注意数据类型(数值、类别等),以及数据蕴含的信息


import numpy as np # linear algebra
import pandas as pd # data processing

# Load the Kaggle house-price data; CSV column 0 ("Id") becomes the row index.
id_column = 0
train_df = pd.read_csv('train.csv', index_col=id_column)
test_df = pd.read_csv('test.csv', index_col=id_column)

# Peek at the first rows: check dimensions, dtypes, and what each column holds.
train_df.head()

二、目标变量平滑化与数据合并

%matplotlib inline
prices=pd.DataFrame({'price':train_df['SalePrice'],'log(price+1)':np.log1p(train_df["SalePrice"])})
prices.hist()
#上面是检验平滑化
y_train=np.log1p(train_df.pop('SalePrice'))
y_train.head()
#这里是发现可以平滑正态了于是开始操作一番y
all_df = pd.concat(objs=[train_df, test_df], axis=0)
#合并训练集与测试集

三、类别特征 one-hot 编码

# MSSubClass encodes the dwelling type as an integer code, so pandas treats
# it as numeric — inspect its dtype and the distribution of codes.
all_df['MSSubClass'].dtypes
all_df['MSSubClass'].value_counts()
# BUG FIX: pd.get_dummies(all_df) only encodes object-dtype columns, so the
# numeric MSSubClass would silently be left as-is. Cast it to str first so
# it is one-hot encoded like the other categorical features.
all_df['MSSubClass'] = all_df['MSSubClass'].astype(str)
# Preview the one-hot encoding for this single column.
pd.get_dummies(all_df['MSSubClass'], prefix='MSSubClass').head()
# One-hot encode every categorical column of the merged frame in one shot.
all_dummy_df = pd.get_dummies(all_df)
all_dummy_df.head()

四、缺失值处理

# Missing-value handling: inspect the worst columns, then impute with means.
missing_counts = all_dummy_df.isnull().sum().sort_values(ascending=False)
missing_counts.head(10)
column_means = all_dummy_df.mean()
column_means.head(10)
all_dummy_df = all_dummy_df.fillna(column_means)
# Verify the imputation: per-row check on one column, then a global NaN count.
all_dummy_df['LotFrontage'].isnull()
all_dummy_df.isnull().sum().sum()

五、数值特征筛选

# Boolean mask over the dtypes picks out the non-categorical (numeric)
# columns of the merged frame.
is_not_object = all_df.dtypes != 'object'
numeric_cols = all_df.columns[is_not_object]
numeric_cols

六、Ridge 回归建模

#将数据集all分开回训练集和测试集
#建模原理:从训练集提取特征并放到测试集中预测
dummy_train_df=all_dummy_df.loc[train_df.index]
dummy_test_df=all_dummy_df.loc[test_df.index]
dummy_train_df.shape#数据检查
#ridge regression模型
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import Ridge
#将数据从dataframe转化为numpy下的array数组
X_train = dummy_train_df.values
X_test = dummy_test_df.values
from sklearn.linear_model import RidgeCV

alphas = np.logspace(-3, 2, 50)
clf = RidgeCV(alphas=alphas, store_cv_values=True)
clf.fit(X_train, y_train)
test_score = np.sqrt(np.mean(clf.cv_values_, axis=0))
import matplotlib.pyplot as plt
%matplotlib inline
plt.plot(alphas, test_scores)
plt.title("Alpha vs CV Error")

七、随机森林建模

# Random-forest model: sweep max_features and record the mean 5-fold CV RMSE.
from sklearn.ensemble import RandomForestRegressor

max_features = [.1, .3, .5, .7, .9, .99]
test_scores = []
for frac in max_features:
    clf = RandomForestRegressor(n_estimators=200, max_features=frac)
    # Negate the sklearn "neg" score back to MSE, then take the root.
    fold_rmse = np.sqrt(-cross_val_score(clf, X_train, y_train, cv=5,
                                         scoring='neg_mean_squared_error'))
    test_scores.append(fold_rmse.mean())
plt.plot(max_features, test_scores)
plt.title("Max Features vs CV Error");

八、模型融合与生成提交结果

# Final models with the hyper-parameters chosen from the CV curves above.
ridge = Ridge(alpha=15)
rf = RandomForestRegressor(n_estimators=500, max_features=.3)
# BUG FIX: ridge.fit was commented out, so ridge.predict below raised
# NotFittedError — both models must be trained before predicting.
ridge.fit(X_train, y_train)
rf.fit(X_train, y_train)
# expm1 undoes the log1p transform applied to the target earlier,
# mapping predictions back to the original price scale.
y_ridge = np.expm1(ridge.predict(X_test))
y_rf = np.expm1(rf.predict(X_test))
# Simplest possible ensemble: average the two models' predictions.
y_final = (y_ridge + y_rf) / 2
# Build the Kaggle submission frame and preview it.
submission_df = pd.DataFrame(data={'Id': test_df.index, 'SalePrice': y_final})
submission_df.head(10)

  • 1
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值