# 机器学习-使用决策树DecisionTreeRegressor模型对水果蔬菜价格预测

## 回归正题

1.明确数据集是用来做什么的
2.数据的处理
3.特征工程
4.利用决策树进行数据预测

### 数据集

fruitvegprices-2017_2022.csv
1.1对数据进行基础分析


python
# Basic exploration of the raw dataset. The original lines were bare tuple
# expressions ('label', value) which print nothing in a script, and
# `data.nunique` was referenced without being called (returns the bound
# method, not the counts) — both fixed.
print('行列信息：', data.shape)             # rows x columns
print('检查唯一值：', data.nunique())       # unique value count per column
print('检查是否有缺失值：\n', data.isnull().sum())  # missing values per column
data.info()                                  # dtypes and non-null counts
print(data.price.describe())                 # distribution of the target column


### 1.1 数据集预处理

1.1.1 ['category', 'item', 'variety']

1.1.2 [date]

### 1.2 标准化

X = (x - mean) / var^0.5，即 X = (x - mean) / std（z-score 标准化）

def unitize(dataset):
    """Standardize every column to zero mean and unit variance (z-score).

    Implements X = (x - mean) / std column-wise, where std is the population
    standard deviation sqrt(mean((x - mean)^2)).

    :param dataset: array-like or DataFrame of shape (m, n_features)
    :return: pandas.DataFrame of the standardized values
    """
    df = pd.DataFrame(dataset)
    mean = df.mean(axis=0)
    # BUG FIX: the original divided each element's squared deviation by the
    # COLUMN COUNT and never averaged over rows, which yields
    # |x - mean| / sqrt(n_cols) instead of a standard deviation, so every
    # standardized entry collapsed to ±sqrt(n_cols).
    std = ((df - mean) ** 2).mean(axis=0) ** 0.5
    return (df - mean) / std


### 1.3 归一化

X=(x-min)/(max-min)

def Stand(dataset):
    """Min-max scale every column into the unit interval.

    Implements X = (x - min) / (max - min) column-wise.

    :param dataset: array-like, Series or DataFrame
    :return: pandas.DataFrame with each column rescaled to [0, 1]
    """
    col_min = dataset.min(0)
    col_max = dataset.max(0)
    span = col_max - col_min
    rows = dataset.shape[0]
    # Broadcast the per-column min / span across all rows, then rescale.
    shifted = pd.DataFrame(dataset) - np.tile(col_min, (rows, 1))
    scaled = shifted / np.tile(span, (rows, 1))
    return scaled



### 1.4 特征工程

# Feature matrix: drop the target and the raw label columns, append the
# encoded category features; target is the price column.
x = pd.concat([data.drop(['price', 'item', 'variety'], axis=1), item_category], axis=1)
y = data['price']
# BUG FIX: the original used a full-width comma (，) before random_state,
# which is a SyntaxError in Python.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, random_state=0)


### scikit-learn 决策树API

scikit-learn 的 DecisionTreeRegressor 默认的划分质量准则（criterion）是均方误差 "squared_error"，也可以选择 "friedman_mse"、"absolute_error"（即 MAE）或 "poisson"；"entropy" 和 "gini" 是分类树 DecisionTreeClassifier 的准则，回归树不支持。

def __init__(
self,
*,
criterion="squared_error",
splitter="best",
max_depth=None,
min_samples_split=2,
min_samples_leaf=1,
min_weight_fraction_leaf=0.0,
max_features=None,
random_state=None,
max_leaf_nodes=None,
min_impurity_decrease=0.0,
ccp_alpha=0.0,):


# Cross-validate the tree depth: scan max_depth over 1..29 with 10-fold CV
# and keep the best (refit=True) estimator.
candidate_depths = list(range(1, 30))
tree = GridSearchCV(DecisionTreeRegressor(),
                    dict(max_depth=candidate_depths),
                    cv=10, refit=True)
tree.fit(x_train, y_train)
print("Best parameter:", tree.best_params_,
      "\nBest Estimator:", tree.best_estimator_,
      "\nBest Score:", tree.best_score_)


### 1.5 通过拟合优度来评判模型训练

R^2→1，模型的数据拟合性就越好
R^2→0，模型的数据拟合度越差

    dt_train_pred = dtr.predict(x_train)
dt_test_pred = dtr.predict(x_test)
print("测试集：",r2_score(y_test,dt_test_pred))
print("训练集：",r2_score(y_train,dt_train_pred))


### 1.6 预测与真实可视化

 x_data = y_test[300:500]
y_data = dt_test_pred[300:500]
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['font.serif'] = ['SimHei']  # 设置正常显示中文
plt.title('预测值（蓝色）与真实值（红色）对比')
plt.plot(x_data,color='red',linewidth=2,linestyle='-')
plt.plot(y_data,color='blue',linewidth=2,linestyle='-')
plt.show()



### 1.7 以均方误差为节点的可视化树

# Persist the fitted regressor to disk.
joblib.dump(dtr, "./f_V.pkl")
# BUG FIX: earlier code rebinds `tree` to a GridSearchCV object, so
# `tree.export_graphviz` would raise AttributeError; import the function
# from sklearn.tree explicitly instead.
from sklearn.tree import export_graphviz
dot_data = export_graphviz(dtr, out_file=None)
graph = graphviz.Source(dot_data)
graph.render("tree")


### 1.8 核心代码如下

def getLabel_date(DataPath):
    """Load the price CSV and encode the categorical and date columns.

    :param DataPath: path to the fruit/vegetable price CSV file
    :return: DataFrame with 'category'/'item'/'variety' label-encoded as ints,
             'unit' dropped, and 'date' expanded into day / month / year
             (year stored as an offset counted back from 2022).
    """
    # BUG FIX: the original never loaded the file — `data` was undefined and
    # the DataPath argument was silently ignored (NameError at runtime).
    data = pd.read_csv(DataPath)

    code = LabelEncoder()
    for label in ['category', 'item', 'variety']:
        data[label] = code.fit_transform(data[label]).astype(int)
    data = data.drop(['unit'], axis=1)

    # Split the timestamp into numeric features the tree can use.
    data['date'] = pd.to_datetime(data['date'])
    data['day'] = data['date'].dt.day
    data['month'] = data['date'].dt.month
    data['year'] = data['date'].dt.year
    data = data.drop(['date'], axis=1)

    # Express year as "years before 2022" (2022 -> 0, 2021 -> 1, ...);
    # the original built this via pd.DataFrame + np.tile for no benefit.
    data['year'] = 2022 - data['year']
    return data

def Standdata(data):
    """Min-max normalize the continuous label columns.

    Values are rounded to two decimals to make the output easier to read.

    :return: DataFrame holding the normalized 'item' and 'variety' columns.
    """
    normalized = [round(Stand(data[col]), 2) for col in ('item', 'variety')]
    return pd.concat(normalized, axis=1)

def TrainAndTest(data, item_variety):
    """Split features and target into train / test partitions (75% / 25%).

    :return: x_train, x_test, y_train, y_test
    """
    # Drop the target and the raw label columns, then append the
    # normalized label features.
    features = pd.concat(
        [data.drop(['price', 'item', 'variety'], axis=1), item_variety],
        axis=1)
    target = data['price']
    xtr, xte, ytr, yte = train_test_split(
        features, target, test_size=0.25, random_state=0)
    return xtr, xte, ytr, yte

def TreeRegressor(x_train, y_train):
    """Tune a DecisionTreeRegressor with grid-search CV, one parameter at a time.

    Scans max_depth, then min_samples_split, then min_samples_leaf; for each
    scan the mean cross-validation score per candidate value is plotted.
    Finally fits a tree with the chosen settings to guard against overfitting.

    :param x_train: training feature matrix
    :param y_train: training target
    :return: fitted DecisionTreeRegressor
    """
    def _scan(estimator, param_name, values, cv, marker):
        # Grid-search a single hyperparameter and plot mean CV score vs value.
        search = GridSearchCV(estimator, {param_name: values}, cv=cv, refit=True)
        search.fit(x_train, y_train)
        means = search.cv_results_['mean_test_score']
        params = search.cv_results_['params']
        for mean, param in zip(means, params):
            plt.plot(param[param_name], mean, marker)
        plt.title(param_name)
        plt.show()
        return search

    _scan(DecisionTreeRegressor(), 'max_depth', list(range(1, 30)), 10, 'b*')

    search = _scan(DecisionTreeRegressor(max_depth=10), 'min_samples_split',
                   list(range(2, 30)), 5, 'ko')
    # BUG FIX: the original printed tree.score(x_test, y_test) here, but
    # x_test / y_test are not in this function's scope (NameError) — removed.
    print("每个超参数每次交叉验证得结果：", search.cv_results_)

    search = _scan(DecisionTreeRegressor(max_depth=10, min_samples_split=15),
                   'min_samples_leaf', list(range(1, 30)), 5, 'rd')

    print("Best parameter:", search.best_params_,
          "\nBest Score:", search.best_score_)
    dtr = DecisionTreeRegressor(criterion='squared_error', max_depth=10,
                                min_samples_split=15, min_samples_leaf=1)
    dtr.fit(x_train, y_train)
    return dtr

def r2score_score(dtr, x_train, x_test, y_train, y_test):
    """Report fit quality (R², MSE, MAE, MAPE) and preview predictions vs truth.

    :param dtr: fitted DecisionTreeRegressor
    :param x_train, x_test, y_train, y_test: the four train/test splits
    """
    dt_train_pred = dtr.predict(x_train)
    dt_test_pred = dtr.predict(x_test)
    # BUG FIX: the original format spec "{:1f}" means field width 1 (prints
    # full float precision); "{:.1f}" — one decimal place — was intended.
    print("测试集:{:.1f}".format(r2score(y_test, dt_test_pred)))
    print("训练集:{:.1f}".format(r2score(y_train, dt_train_pred)))
    print('均方误差:{:.1f}'.format(M_S_E(y_test, dt_test_pred)))
    print('平均绝对误差:{:.1f}'.format(MAE(dt_test_pred, y_test)))
    print(mean_absolute_percentage_error(y_test, dt_test_pred))

    # Side-by-side preview of actual vs predicted prices (indices realigned
    # so the two columns pair up row-by-row).
    y_test = pd.DataFrame.reset_index(pd.DataFrame(y_test), drop=True)
    dt_test_pred = round(pd.DataFrame(dt_test_pred), 2)
    compare = pd.concat([y_test, dt_test_pred], axis=1)
    print(np.array(compare)[300:500])
`
• 5
点赞
• 62
收藏
觉得还不错? 一键收藏
• 打赏
• 35
评论
02-08 2万+
05-10 1419
02-19
11-02
03-30
05-04 338

### “相关推荐”对你有帮助么？

• 非常没帮助
• 没帮助
• 一般
• 有帮助
• 非常有帮助

Leon在努力啊

¥1 ¥2 ¥4 ¥6 ¥10 ¥20

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。