import numpy as np
import pandas as pd
from sklearn import linear_model,datasets,metrics
import matplotlib.pyplot as plt
# Train an ordinary least-squares model on the CSV data and report
# hold-out metrics. The two imports below are kept inside this section so it
# remains self-contained.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# Load the full dataset (original author's note: 9568 rows x 5 columns).
data = pd.read_csv('Folds5x2_pp.csv')

# Feature matrix: the four input columns (author's note: 9568 x 4).
X = data[['AT', 'V', 'AP', 'RH']]
# Target, kept as a one-column DataFrame by the double brackets
# (author's note: 9568 x 1).
y = data[['PE']]

# Random split with 25% held out for testing. The fixed random_state makes
# the split identical on every run so the experiment is reproducible.
# Author's notes on sizes: train 7176 rows, test 2392 rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=1
)

# Fit the linear model on the training portion.
LR = LinearRegression()
LR.fit(X_train, y_train)

# Report the learned parameters: intercept and per-feature weights.
print('LR.intercept:\n', LR.intercept_)
print('LR.coef:\n', LR.coef_)

# Coefficient of determination R^2 on the held-out data. 1.0 is a perfect
# fit and larger is better. It is NOT confined to [0, 1]: on held-out data
# the score can be negative when the model does worse than always predicting
# the mean of the target.
print('R^2:\n', LR.score(X_test, y_test))

# Predictions for the held-out features.
y_predict = LR.predict(X_test)

# MSE is the mean of the squared differences between the true test targets
# and the predictions; RMSE is its square root. Compute the MSE once and
# reuse it rather than evaluating the metric twice.
test_mse = metrics.mean_squared_error(y_test, y_predict)
print("MSE:", test_mse)
print('RMSE:', np.sqrt(test_mse))
# Sample output from the original run:
# LR.intercept:
# [ 447.06297099]
# LR.coef:
# [[-1.97376045 -0.23229086 0.0693515 -0.15806957]]
# R^2:
# 0.931716257578
# MSE: 20.0804012021
# RMSE: 4.48111606657
# Show the first five predictions next to the first five true test targets.
print('y_predict:\n', y_predict[:5])
print('y_test:\n', y_test[:5])

# Scatter of true test values (x-axis) against predictions (y-axis).
plt.scatter(y_test, y_predict, c='b', alpha=0.5, marker='*')
plt.xlabel('y_test')
plt.ylabel('y_predict')

# Dashed black y = x reference line from the smallest to the largest true
# value; points on this line are predicted exactly.
test_lo, test_hi = y_test.min(), y_test.max()
plt.plot([test_lo, test_hi], [test_lo, test_hi], 'k--', lw=4)
plt.show()
# Sample output from the original run:
# y_predict:
# [[ 459.32136845]
# [ 433.9320719 ]
# [ 474.84501331]
# [ 434.21338967]
# [ 452.56159683]]
# y_test:
# PE
# 5014 458.92
# 6947 430.55
# 9230 473.85
# 4290 435.02
# 6477 456.44
# Cross-validated evaluation of the same estimator on the training split.
from sklearn.model_selection import cross_val_predict
from sklearn import metrics

# With cv=5 the training set is divided into 5 equal parts; each part in turn
# serves as the validation fold while the other 4 are used for fitting, giving
# 5 train/validation combinations. The result holds one out-of-fold prediction
# per training row (author's note: 7176 x 1).
cross_predict = cross_val_predict(LR, X_train, y_train, cv=5)
print(cross_predict)

# Error of the out-of-fold predictions against the true training targets.
# Compute the MSE once and derive the RMSE from it instead of evaluating the
# metric twice.
cv_mse = metrics.mean_squared_error(y_train, cross_predict)
print("MSE:", cv_mse)
print('RMSE:', np.sqrt(cv_mse))

# Scatter of true training values (x-axis) against out-of-fold predictions.
plt.scatter(y_train, cross_predict, c='b', alpha=0.5, marker='*')
plt.xlabel('y_train')
plt.ylabel('cross_predict')

# Dashed black y = x reference line across the range of the true values.
train_lo, train_hi = y_train.min(), y_train.max()
plt.plot([train_lo, train_hi], [train_lo, train_hi], 'k--', lw=4)
plt.show()
# Sample output from the original run:
# MSE: 21.0332779559
# RMSE: 4.5862051803