1.导入各种需要的包
import csv
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from pprint import pprint
2.读入数据
# Load the advertising data set and pick the model inputs and the target.
# Columns in the CSV: TV, Radio, Newspaper, Sales.
path = 'Advertising.csv'
data = pd.read_csv(path)

# Only TV and Radio are used as features here; to include the third channel,
# select data[['TV', 'Radio', 'Newspaper']] instead.
x = data[['TV', 'Radio']]
y = data['Sales']

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(x)
print(y)
3.训练模型
# Split into training and test sets: 80% of the rows go to training
# (train_test_split would otherwise default to a 75% / 25% split).
# A fixed random_state keeps the split the same from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, train_size=0.8, random_state=1)
print(type(x_test))  # x_test is a pandas DataFrame at this point
print('{} {}'.format(x_train.shape, y_train.shape))

linreg = LinearRegression()  # linear regression estimator
model = linreg.fit(x_train, y_train)  # fit on the training split
print(model)  # show the fitted estimator

# coef_ holds Theta_1 ... Theta_n (one weight per feature column);
# intercept_ holds Theta_0.
print('{} {}'.format(linreg.coef_, linreg.intercept_))
输出结果为:[ 0.04686997 0.1800065 ] 2.94751503603;表示 Theta_1 = 0.04686997,Theta_2 = 0.1800065,Theta_0 = 2.94751503603
4.进行预测
# Re-order the test set by ascending target value before predicting.
# argsort() on the raw values returns positions (0..n-1) in sorted order;
# the index labels carried by y_test are not involved.
order = y_test.values.argsort()
y_test = y_test.values[order]  # targets as a sorted NumPy array

# Re-order the feature rows by the same positions. x_test stays a DataFrame
# so the column names seen at fit time are still present when predicting.
x_test = x_test.iloc[order]

y_hat = linreg.predict(x_test)  # predictions for the re-ordered test set
argsort 按 y_test 的值从小到大排序,返回每个值对应的位置索引;y_test 自带的 index 不变。
5.计算MSE和R2
# Error metrics on the test set.
mse = np.average((y_hat - np.array(y_test)) ** 2)  # Mean Squared Error
rmse = np.sqrt(mse)  # Root Mean Squared Error
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('MSE = {} RMSE = {}'.format(mse, rmse))

# R^2 = 1 - RSS/TSS: larger is better and 1 is the optimum.
# score() computes R^2; the two lines are labelled so that the training and
# test values can be told apart.
print('R2 (train) = {}'.format(linreg.score(x_train, y_train)))
print('R2 (test) = {}'.format(linreg.score(x_test, y_test)))
6.画图
# Plot actual vs. predicted sales over the test samples.
# NOTE(review): the CJK labels below need a font with Chinese glyphs to be
# configured in Matplotlib; confirm on the target machine.
plt.figure(facecolor='w')
t = np.arange(len(x_test))  # sample positions used as the x axis
plt.plot(t, y_test, 'r-', linewidth=2, label=u'真实数据')  # actual data
plt.plot(t, y_hat, 'g-', linewidth=2, label=u'预测数据')  # predicted data
plt.legend(loc='upper right')
plt.title(u'线性回归预测销量', fontsize=18)  # "Sales predicted by linear regression"
# Pass the flag positionally: the keyword was renamed from `b` to `visible`
# in newer Matplotlib releases, while the positional form works in both.
plt.grid(True)
plt.show()