Regression models: leave-one-out cross-validation and K-fold cross-validation

# coding=UTF-8
# FileName: 交叉验证.py
#######################################################
######### Leave-one-out CV, CSV dataset, measuring MSE #############
# Note: the file has 301 rows but only 300 show up when read; a minor issue
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
data = pd.read_csv('path/to/your_data.csv')  # placeholder: path to the CSV file
df = np.array(data.values)
# Min-max normalize the 9 feature columns (columns 0-8) to [0, 1]
for i in range(9):
    df[:, i] = (df[:, i] - df[:, i].min()) / (df[:, i].max() - df[:, i].min())
x = df[:, :9]  # features
y = df[:, 9]   # target (10th column)
# criterion='squared_error' is the current name for the MSE criterion (older sklearn used 'mse')
clf = RandomForestRegressor(criterion='squared_error', bootstrap=False, max_features='sqrt',
                            max_depth=23, min_samples_split=12, n_estimators=1400, min_samples_leaf=2)
# 300 samples, one MSE point per left-out sample
names = [str(i) for i in range(300)]
t = range(len(names))
point = []

loo = LeaveOneOut()
for train_i, test_i in loo.split(df):
    print("%s %s" % (train_i, test_i))
    clf.fit(x[train_i], y[train_i])
    y_test_pred=clf.predict(x[test_i])
    mse = mean_squared_error(y[test_i], y_test_pred)
    point.append(mse)
    print('Sample %d MSE: %f' % (test_i[0], mse))

plt.plot(t, point, ms=3, label='RandomForest-Leave-out-cv')
plt.xlabel('Data-ID')
plt.ylabel('MSE')
plt.legend(loc='best')
plt.grid()
plt.show()
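The per-sample errors collected in point can also be obtained in a single call; a minimal sketch, assuming x, y, clf and point as defined above (cross_val_score reports the negated MSE, so the sign is flipped back when printing):

from sklearn.model_selection import cross_val_score
neg_mse = cross_val_score(clf, x, y, cv=LeaveOneOut(), scoring='neg_mean_squared_error')
print('Overall LOO MSE via cross_val_score: %f' % (-neg_mse.mean()))
print('Mean of the per-sample MSEs collected above: %f' % np.mean(point))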

#######################################################
######### Leave-one-out CV, built-in dataset, measuring MSE ########
# load_linnerud is a small built-in regression dataset
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.datasets import load_linnerud
# from sklearn.model_selection import LeaveOneOut
# from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt
#
# dataset = load_linnerud()
# print(dataset)
# # data and target are each 20 samples x 3 columns
# x, y = dataset.data, dataset.target
# print(x.shape)
# print(y.shape)
# clf = RandomForestRegressor(n_estimators=500)
# # 20 samples, so LOO gives 20 folds; x coordinates 0-19
# names = range(0,20)
# names = [str(t) for t in list(names)]
# t = range(len(names))
# point = []
#
# loo = LeaveOneOut()
# for train_i, test_i in loo.split(x):  # split on the feature array, not on the dataset Bunch
#     print("%s %s" % (train_i, test_i))
#     clf.fit(x[train_i], y[train_i])
#     y_test_pred=clf.predict(x[test_i])
#     mse = mean_squared_error(y[test_i], y_test_pred)
#     point.append(mse)
#     print('Sample %d MSE: %f' % (test_i[0], mse))
#     # print('MSE', mean_squared_error(y[test_i], y_test_pred))
#
# plt.plot(t, point, ms=3, label='RandomForest-Leave-out-cv')
# plt.legend(loc='best')
# plt.grid()
# plt.show()


#######################################################
######### K-fold CV, CSV dataset, hyper-parameter selection ########

# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt
# from sklearn.model_selection import train_test_split
# from sklearn.model_selection import GridSearchCV
# import numpy as np
# import pandas as pd
#
# # # # # # # # # # # # # # # # # # # # # # # # # # # # # #
# data = pd.read_csv('path/to/your_data.csv')  # placeholder: path to the CSV file
# df=data.values
# df=np.array(df)
# # Min-max normalize feature columns [0, 8] to (0, 1); of the 10 columns, the first 9 are features
# for i in range(9):
#     df[:,i]=(df[:,i]-df[:,i].min())/(df[:,i].max()-df[:,i].min())
#
# # Note: x and y are taken from the raw DataFrame here, so the normalization above is not applied to them
# x = data[['Dosef', 'Hform', 'Natoms', 'Mass', 'Cellarea', 'Energy', 'Fmax', 'Smax', 'Volume']]
# y = data['GAP']
# # random_state=1 makes the split reproducible: the same seed always yields the same split,
# # while different seeds yield different splits
# x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.1, random_state=1)
#
# model =  RandomForestRegressor(n_estimators=500)
# max_depth=[5,10]
# # cv=5 splits the training data into 5 equal folds
# rf_model = GridSearchCV(model, param_grid={'max_depth': max_depth} ,cv=5)
# # rf_model.fit(x, y)
# rf_model.fit(x_train, y_train)
# y_pred=rf_model.predict(x_test)
# print(mean_squared_error(y_test, y_pred))
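# # A minimal sketch (assuming rf_model has been fitted as above): GridSearchCV keeps the
# # selected hyper-parameters and the cross-validated score of every candidate.
# print(rf_model.best_params_)                      # the chosen max_depth
# print(rf_model.best_score_)                       # mean CV score (R^2 by default) of the best candidate
# print(rf_model.cv_results_['mean_test_score'])    # one mean score per parameter combination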
#######################################################
######### K-fold CV, CSV dataset, measuring MSE ####################
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import numpy as np
import pandas as pd
data = pd.read_csv('path/to/your_data.csv')  # placeholder: path to the CSV file
df = np.array(data.values)
# Min-max normalize the 9 feature columns (columns 0-8) to [0, 1]
for i in range(9):
    df[:, i] = (df[:, i] - df[:, i].min()) / (df[:, i].max() - df[:, i].min())
x = df[:, :9]  # features
y = df[:, 9]   # target (10th column)
clf = RandomForestRegressor(criterion='squared_error', bootstrap=False, max_features='sqrt',
                            max_depth=23, min_samples_split=12, n_estimators=1400, min_samples_leaf=2)

# 3 folds, one MSE point per fold
names = [str(i) for i in range(3)]
t = range(len(names))
point = []
# Split the data into 3 equal folds; each fold serves once as the test set
# while the other two folds form the training set
kf = KFold(n_splits=3)
for train_i, test_i in kf.split(df):
    print("%s %s" % (train_i, test_i))
    clf.fit(x[train_i], y[train_i])
    y_test_pred=clf.predict(x[test_i])
    mse = mean_squared_error(y[test_i], y_test_pred)
    point.append(mse)
    print('Fold starting at sample %d, MSE: %f' % (test_i[0], mse))

plt.plot(t, point, ms=3, label='KFold')
plt.xlabel('Fold')
plt.ylabel('MSE')
plt.legend(loc='best')
plt.grid()
plt.show()
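The three fold MSEs are usually summarized by their mean and standard deviation. Below is a minimal sketch, assuming point, clf, x and y as defined above; the shuffled KFold is an assumption, useful when the CSV rows are ordered:

print('3-fold MSE: %.4f +/- %.4f' % (np.mean(point), np.std(point)))
# If the rows of the CSV are ordered, shuffling before splitting usually gives a
# more representative estimate (shuffle=True and random_state=1 are assumptions here).
from sklearn.model_selection import cross_val_score
kf_shuffled = KFold(n_splits=3, shuffle=True, random_state=1)
neg_mse = cross_val_score(clf, x, y, cv=kf_shuffled, scoring='neg_mean_squared_error')
print('Shuffled 3-fold MSE: %.4f +/- %.4f' % (-neg_mse.mean(), neg_mse.std()))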
