In [1]:
# 载入此项目所需要的库
import numpy as np
import pandas as pd
import visuals as vs # Supplementary code
# Check the Python version -- this project requires exactly Python 2.7.
from sys import version_info
# Bug fix: the original used `and`, so an interpreter with minor == 7 but a
# different major (e.g. Python 3.7) passed the check it was meant to fail.
# With `or`, anything that is not major 2 AND minor 7 is rejected.
if version_info.major != 2 or version_info.minor != 7:
    raise Exception('请使用Python 2.7来完成此项目')
# 让结果在notebook中显示
%matplotlib inline
In [2]:
# Load the Boston housing dataset
data = pd.read_csv('housing.csv')
# MEDV (median home value) is the prediction target; all other columns are features
prices = data['MEDV']
features = data.drop('MEDV', axis = 1)
# Done -- report the dataset dimensions
print "Boston housing dataset has {} data points with {} variables each.".format(*data.shape)
In [3]:
#TODO 1
# Goal: compute the minimum price
minimum_price = np.min(prices)
# Goal: compute the maximum price
maximum_price = np.max(prices)
# Goal: compute the mean price
mean_price = np.mean(prices)
# Goal: compute the median price
median_price = np.median(prices)
# Goal: compute the standard deviation of prices
# NOTE(review): np.std uses ddof=0 (population std), whereas pandas
# Series.std defaults to ddof=1 (sample std) -- confirm which is intended.
std_price = np.std(prices)
# Goal: print the computed statistics
print "Statistics for Boston housing dataset:\n"
print "Minimum price: ${:,.2f}".format(minimum_price)
print "Maximum price: ${:,.2f}".format(maximum_price)
print "Mean price: ${:,.2f}".format(mean_price)
print "Median price ${:,.2f}".format(median_price)
print "Standard deviation of prices: ${:,.2f}".format(std_price)
In [4]:
# 载入画图所需要的库 matplotlib
import matplotlib.pyplot as plt
# 使输出的图像以更高清的方式显示
%config InlineBackend.figure_format = 'retina'
# 调整图像的宽高
plt.figure(figsize=(16, 4))
for i, key in enumerate(['RM', 'LSTAT', 'PTRATIO']):
plt.subplot(1, 3, i+1)
plt.xlabel(key)
plt.scatter(data[key], data['MEDV'], alpha=0.5)
In [5]:
# TODO 2
# Hint: import train_test_split
from sklearn.model_selection import train_test_split
# Hold out 20% of the data as the test set.
# NOTE(review): random_state = 40 here, while the Beijing split later uses 42 --
# confirm the inconsistency is intentional.
X_train, X_test, y_train, y_test = train_test_split(features,prices,test_size = 0.2, random_state = 40)
In [6]:
# TODO 3
# Hint: import r2_score
def performance_metric(y_true, y_predict):
    """Compute and return the R^2 (coefficient of determination) of the
    predictions `y_predict` against the true target values `y_true`."""
    from sklearn.metrics import r2_score
    score = r2_score(y_true,y_predict)
    return score
In [7]:
# TODO 3 (optional)
# No library that computes the coefficient of determination may be imported.
def performance_metric2(y_true, y_predict):
    """Compute R^2 (coefficient of determination) by hand, without sklearn.

    Parameters:
        y_true: array-like of true target values
        y_predict: array-like of predicted values, same length as y_true
    Returns:
        float: R^2 = 1 - SS_res / SS_tot
    Raises:
        ValueError: if the inputs differ in length.  (The original printed a
            message and silently returned None, which made callers crash later.)
    """
    # Validate before doing any work (the original summed SS_tot first).
    if len(y_true) != len(y_predict):
        raise ValueError("y_true and y_predict must have the same length")
    y_true = np.asarray(y_true, dtype=float)
    y_predict = np.asarray(y_predict, dtype=float)
    # Total sum of squares around the mean of the true values
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    # Residual sum of squares of the predictions
    ss_res = np.sum((y_true - y_predict) ** 2)
    return float(1.0 - ss_res / ss_tot)
In [8]:
# Compute R^2 for this toy example's predictions (expected ~0.923)
score = performance_metric2([3, -0.5, 2, 7, 4.2], [2.5, 0.0, 2.1, 7.8, 5.3])
print "Model has a coefficient of determination, R^2, of {:.3f}.".format(score)
In [40]:
# Generate learning curves for varying training-set sizes and max depths
vs.ModelLearning(X_train, y_train)
In [41]:
# Generate the complexity curve for varying max-depth values
vs.ModelComplexity(X_train, y_train)
In [42]:
# TODO 4
#提示: 导入 'KFold' 'DecisionTreeRegressor' 'make_scorer' 'GridSearchCV'
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    """Grid-search over decision-tree depth to find the optimal decision-tree
    regression model for the data [X, y].

    Parameters:
        X: feature data
        y: target prices
    Returns:
        the best DecisionTreeRegressor found (already refitted on all of
        [X, y] by GridSearchCV's default refit=True)
    """
    # 3-fold cross validation
    cross_validator = KFold(n_splits = 3)
    regressor = DecisionTreeRegressor()
    # Candidate tree depths 1..10
    params = {'max_depth': np.arange(1, 11)}
    # Score candidates with the R^2 metric defined above
    scoring_fnc = make_scorer(performance_metric)
    grid = GridSearchCV(regressor, params, scoring = scoring_fnc, cv = cross_validator)
    # Run the grid search on [X, y]
    grid = grid.fit(X, y)
    # The debug print of grid.cv_results_ was removed: the later
    # vs.PredictTrials cell explicitly requires fit_model to print nothing.
    # Return the best model found by the grid search
    return grid.best_estimator_
In [43]:
# TODO 4 (optional)
# No sklearn library other than DecisionTreeRegressor may be used here.
# Hint: implement the cross_val_score function below.
def cross_val_score(estimator, X, y, scoring = performance_metric, cv=3):
    """Return the list of validation scores from a `cv`-fold cross validation.

    The data is split positionally into `cv` consecutive folds (the last fold
    absorbs any remainder rows).  For each fold, the estimator is fitted on
    the remaining folds and scored on the held-out fold with `scoring`.
    The original ignored its `scoring` and `cv` parameters (hard-coded 3 folds
    and performance_metric); they are now honored, with unchanged defaults.

    Parameters:
        estimator: a regressor exposing fit/predict
        X, y: feature matrix and targets (array-like; sliced positionally)
        scoring: score function f(y_true, y_pred); defaults to performance_metric
        cv: number of folds (default 3, matching the original behavior)
    Returns:
        list of `cv` scores, in fold order
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n = len(X)
    # Floor division stays correct under both Python 2 and Python 3
    # (the original `len(X)/3` breaks under Python 3's true division).
    fold_size = n // cv
    scores = []
    for k in range(cv):
        start = k * fold_size
        # The last fold takes any remainder rows
        stop = n if k == cv - 1 else (k + 1) * fold_size
        train_idx = np.concatenate((np.arange(0, start), np.arange(stop, n)))
        # Refitting the same estimator per fold mirrors the original code,
        # which aliased one estimator object as pred1/pred2/pred3.
        estimator.fit(X[train_idx], y[train_idx])
        y_pred = estimator.predict(X[start:stop])
        scores.append(scoring(y[start:stop], y_pred))
    return scores
def fit_model2(X, y):
    """Hand-rolled grid search over decision-tree depth using the home-made
    cross_val_score; return the best decision-tree model.

    Parameters:
        X, y: training features and target prices
    Returns:
        a DecisionTreeRegressor with the best max_depth, fitted on all of
        [X, y] (the original returned an unfitted estimator, so predicting
        with it would fail; fitting also matches fit_model / GridSearchCV's
        refit=True behavior)
    """
    from sklearn.tree import DecisionTreeRegressor
    # Mean cross-validation score for each candidate depth 1..10
    depth_scores = {}
    for depth in range(1, 11):
        reg = DecisionTreeRegressor(max_depth = depth)
        fold_scores = cross_val_score(reg, X, y, scoring = performance_metric, cv = 3)
        depth_scores[depth] = np.mean(fold_scores)
    # Depth with the highest mean validation score.
    # (The debug `print dictModel` was removed -- see the PredictTrials cell,
    # which requires the fitting functions to be silent.)
    best_depth = max(depth_scores.items(), key=lambda item: item[1])[0]
    # Refit the winning configuration on the full data so the returned
    # model is immediately usable for prediction.
    best_estimator = DecisionTreeRegressor(max_depth = best_depth)
    best_estimator.fit(X, y)
    return best_estimator
In [48]:
# Fit the optimal model on the training data
optimal_reg = fit_model(X_train, y_train)
# Report the optimal model's 'max_depth' parameter
print "Parameter 'max_depth' is {} for the optimal model.".format(optimal_reg.get_params()['max_depth'])
In [251]:
# Data for three hypothetical clients: [RM, LSTAT, PTRATIO]
client_data = [[5, 17, 15], # Client 1
               [4, 32, 22], # Client 2
               [8, 3, 12]]  # Client 3
# Predict each client's home selling price
predicted_price = optimal_reg.predict(client_data)
for i, price in enumerate(predicted_price):
    print "Predicted selling price for Client {}'s home: ${:,.2f}".format(i+1, price)
In [252]:
#TODO 5
# Hint: you may need X_test, y_test, optimal_reg, performance_metric
# Hint: see the prediction code of Question 10
# Hint: see the R^2 computation of Question 3
# Evaluate the optimal model on the held-out test set
y_pred = optimal_reg.predict(X_test)
r2 = performance_metric(y_test,y_pred)
print "Optimal model has R^2 score {:,.2f} on test data".format(r2)
In [253]:
# First comment out all print statements inside fit_model, or the trial
# output below becomes unreadable.
# Sensitivity check: refit on several shuffled splits and compare predictions.
vs.PredictTrials(features, prices, fit_model, client_data)
In [254]:
# TODO 6
# Your code: repeat the full pipeline on the Beijing housing data
data1 = pd.read_csv('bj_housing.csv')
# 'Value' is the prediction target; the remaining columns are features
BJprices = data1['Value']
BJfeatures = data1.drop('Value', axis = 1)
# Done -- report the dataset dimensions
print "BeiJing housing dataset has {} data points with {} variables each.".format(*data1.shape)
#from sklearn.model_selection import train_test_split
# NOTE(review): this rebinds X_train/X_test/y_train/y_test, shadowing the
# Boston split from the earlier cell -- earlier cells cannot be re-run as-is.
X_train, X_test, y_train, y_test = train_test_split(BJfeatures,BJprices,test_size = 0.2, random_state = 42)
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
def fit_model(X, y):
    """Grid-search for the best decision-tree regressor on [X, y].

    Tries depths 1-14 with 5-fold cross validation, scored by the R^2
    metric, and returns the best estimator found (refitted on all of
    [X, y] by GridSearchCV's default refit=True).
    """
    # Candidate depths and their scorer
    depth_grid = {'max_depth': np.arange(1, 15)}
    scorer = make_scorer(performance_metric)
    # 5-fold cross-validation splitter and the base regressor
    folds = KFold(n_splits = 5)
    tree = DecisionTreeRegressor()
    # Run the grid search on [X, y]
    searcher = GridSearchCV(tree, depth_grid, scoring = scorer, cv = folds)
    searcher = searcher.fit(X, y)
    # Hand back the winning model
    return searcher.best_estimator_
# Fit the Beijing model and show its configuration
reg = fit_model(X_train,y_train)
print reg
# NOTE(review): printing the full prediction arrays floods the output;
# consider printing a slice or summary statistics instead.
pred_train = reg.predict(X_train)
print pred_train
pred_test = reg.predict(X_test)
print pred_test
# R^2 score on the Beijing test split
print "测试数据得分:",performance_metric(y_test,pred_test)