原数据集的 target 都是正值，但按如下代码做出来的预测却是负值：
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load the Boston housing data and hold out 20% of the rows for testing.
housing = load_boston()
features, target = housing.data, housing.target
xtrain, xtest, ytrain, ytest = train_test_split(
    features, target, test_size=0.2, random_state=35
)

# Standardize the features and the target with two separate scalers.
# NOTE(review): fit_transform is called on the test split as well, so each
# scaler ends up re-fitted on test data rather than reusing the training fit.
feature_scaler = StandardScaler()
xtrain = feature_scaler.fit_transform(xtrain)
xtest = feature_scaler.fit_transform(xtest)

target_scaler = StandardScaler()
ytrain = target_scaler.fit_transform(ytrain)
ytest = target_scaler.fit_transform(ytest)

# Fit the linear model on the standardized training data.
lr = LinearRegression()
lr.fit(xtrain, ytrain)
ypredict = lr.predict(xtest)

# Predict for the first sample of the raw dataset.
# NOTE(review): the sample is passed in unscaled and the model was trained on
# a standardized target, so the value returned here is not in the units of the
# original target.
xwp = housing.data[0]
lr.predict(xwp)
模型训练之前先对数据（包括 target）做了标准化，所以模型输出的是标准化之后的值，出现负值是正常的。预测时需要先用训练集的均值和标准差对输入做同样的标准化，再把预测结果逆变换回标准化之前的尺度。这里可以使用训练集的数据对方差以及均值进行估计。可以试一下下面的代码。
import numpy as np
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

boston = load_boston()
x = boston.data
y = boston.target
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=35
)

# Fit the scalers on the training split only and reuse them for the test
# split, so both splits are scaled with the same mean and standard deviation.
xs = StandardScaler()
ys = StandardScaler()
xtrain_norm = xs.fit_transform(xtrain)
xtest_norm = xs.transform(xtest)
# StandardScaler expects a 2-D array, so the 1-D target is reshaped into a
# single column for scaling and flattened again afterwards.
ytrain_norm = ys.fit_transform(ytrain.reshape(-1, 1)).ravel()
ytest_norm = ys.transform(ytest.reshape(-1, 1)).ravel()

lr = LinearRegression()
lr.fit(xtrain_norm, ytrain_norm)

# Training-set statistics used to move between raw and standardized units.
# np.std defaults to the population standard deviation (ddof=0).
xtrain_std = np.std(xtrain, axis=0)
xtrain_mean = np.mean(xtrain, axis=0)
ytrain_std = np.std(ytrain, axis=0)
ytrain_mean = np.mean(ytrain, axis=0)

# Calculate prediction: standardize the raw sample, apply the fitted linear
# model (coefficients plus intercept), then map the result back to the
# original target scale.
xwp = boston.data[0]
xwp_norm = (xwp - xtrain_mean) / xtrain_std
prediction_norm = np.dot(xwp_norm, lr.coef_) + lr.intercept_
prediction = ytrain_mean + ytrain_std * prediction_norm
print(prediction)
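另外，sklearn 的线性模型本身提供了做标准化的选项（`normalize=True`），所以可以直接用下面的办法避免自己做标准化的步骤。注意：`normalize` 参数自 scikit-learn 1.0 起已弃用，并在 1.2 中移除；新版本请改用 `Pipeline` 搭配 `StandardScaler`：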
from sklearn.datasets import load_boston
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

boston = load_boston()
x = boston.data
y = boston.target
xtrain, xtest, ytrain, ytest = train_test_split(
    x, y, test_size=0.2, random_state=35
)

# Let the estimator scale the features itself; the target is left untouched,
# so predictions come out directly in the original target units.
# NOTE: the `normalize` option was deprecated in scikit-learn 1.0 and removed
# in 1.2; on newer versions use a Pipeline with StandardScaler instead.
lr = LinearRegression(normalize=True)
lr.fit(xtrain, ytrain)

# Calculate prediction. predict() expects a 2-D array of shape
# (n_samples, n_features), so the single sample is reshaped into one row.
xwp = boston.data[0]
prediction = lr.predict(xwp.reshape(1, -1))
print(prediction)