# File: get_x_y.py
import pandas as pd
from sklearn.model_selection import train_test_split
def get_xy(path='******'):
    """Load the housing CSV and build the feature/target data sets.

    Rows that contain a missing value in any column are dropped. The
    ``Price`` column is used as the target and a fixed list of columns as
    the features. The result is then split into training and validation
    parts with ``train_test_split`` using ``random_state=0``.

    Args:
        path: Location of the CSV file passed to ``pd.read_csv``. The
            default ``'******'`` appears to be a redacted placeholder, so
            pass a real path (or edit the default) before running.

    Returns:
        A list ``[train_X, val_X, train_y, val_y, X, y]`` where ``X`` and
        ``y`` are the full (unsplit) features and target.
    """
    # Read the data.
    melbourne_data = pd.read_csv(path)
    # Drop every row that has a missing value in any column.
    filtered_melbourne_data = melbourne_data.dropna(axis=0)

    # Target column.
    y = filtered_melbourne_data.Price

    # Feature columns.
    # NOTE(review): 'Lattitude' / 'Longtitude' presumably match the CSV
    # headers as spelled; verify against the data before "correcting" them.
    melbourne_features = [
        'Rooms', 'Bathroom', 'Landsize', 'BuildingArea',
        'YearBuilt', 'Lattitude', 'Longtitude',
    ]
    X = filtered_melbourne_data[melbourne_features]

    # The fixed seed makes the train/validation split the same on every call.
    train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=0)
    return [train_X, val_X, train_y, val_y, X, y]
# File: get_mae.py
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
def get_mae(max_leaf_nodes, train_X, val_X, train_y, val_y):
    """Return the validation MAE of a tree limited to ``max_leaf_nodes``.

    A ``DecisionTreeRegressor`` with ``random_state=0`` is fitted on the
    training data, used to predict the validation features, and scored
    with ``mean_absolute_error`` against the validation target.

    Args:
        max_leaf_nodes: Value forwarded to the regressor's
            ``max_leaf_nodes`` argument.
        train_X: Training features.
        val_X: Validation features.
        train_y: Training target.
        val_y: Validation target.

    Returns:
        The mean absolute error of the predictions on ``val_X``.
    """
    regressor = DecisionTreeRegressor(
        max_leaf_nodes=max_leaf_nodes,
        random_state=0,
    )
    # Fit on the training split, then predict the validation split.
    predictions = regressor.fit(train_X, train_y).predict(val_X)
    return mean_absolute_error(val_y, predictions)
# File: get_min_max_leaf_nodes.py
from get_x_y import get_xy
from get_mae import get_mae
def get_min_mln(start=5, stop=5000):
    """Find the ``max_leaf_nodes`` value with the lowest validation MAE.

    Every integer in ``range(start, stop)`` is passed to ``get_mae`` with
    the train/validation split returned by ``get_xy()``. The number of
    candidates that reach the minimum, and then the chosen candidate with
    its MAE, are printed.

    Args:
        start: First candidate value (inclusive). Defaults to 5.
        stop: End of the candidate range (exclusive). Defaults to 5000.

    Returns:
        A list ``[best_x, min_mae, x_list, mae_list]``: the best candidate,
        its MAE, all candidates tried, and the MAE of each candidate in the
        same order. When several candidates tie, the first (smallest) one
        is returned.

    Raises:
        ValueError: If ``range(start, stop)`` is empty.
    """
    x_list = list(range(start, stop))
    if not x_list:
        raise ValueError(
            f"empty search range: start={start!r}, stop={stop!r}"
        )

    # Only the four split parts are needed here; the unsplit X / y that
    # get_xy() also returns are ignored.
    train_X, val_X, train_y, val_y = get_xy()[:4]

    # One model is fitted per candidate value.
    mae_list = [
        get_mae(x, train_X=train_X, val_X=val_X, train_y=train_y, val_y=val_y)
        for x in x_list
    ]

    # Lowest error and how many candidates achieve it.
    min_mae = min(mae_list)
    num_min_mae = mae_list.count(min_mae)
    print(num_min_mae)

    # list.index returns the first match, i.e. the smallest tied candidate.
    best_x = x_list[mae_list.index(min_mae)]
    print(best_x, min_mae)
    return [best_x, min_mae, x_list, mae_list]
# File: final_model.py
# The helper module is named get_min_max_leaf_nodes ("nodes", not "notes").
from get_min_max_leaf_nodes import get_min_mln
from get_x_y import get_xy
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_absolute_error

# Look up the best max_leaf_nodes value (first element of the result list).
get_list = get_min_mln()
min_mln = get_list[0]

# Define the final model.
# NOTE(review): random_state=1 here, while get_mae() searches with
# random_state=0 - confirm whether the difference is intentional.
final_model = DecisionTreeRegressor(max_leaf_nodes=min_mln, random_state=1)

# Fit on the full data set: indices 4 and 5 of get_xy() are the unsplit
# features X and target y.
X_y = get_xy()
final_model.fit(X_y[4], X_y[5])

# Predict on the same data the model was fitted on, so the printed MAE is
# an in-sample (training) error rather than a held-out validation score.
pred = final_model.predict(X_y[4])
print(pred, mean_absolute_error(X_y[5], pred))