# 线性回归实战：分析汽车油耗效率

## 模型分析

import pandas as pd
import matplotlib.pyplot as plt
# The raw data set ships without a header row, so the column names are
# supplied explicitly here.
columns = ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]
# Fields are separated by runs of whitespace. ``sep=r"\s+"`` is the documented
# equivalent of ``delim_whitespace=True``, which is deprecated in recent
# pandas releases and would otherwise emit a FutureWarning (or fail outright
# once the keyword is removed).
cars = pd.read_table("auto-mpg.data", sep=r"\s+", names=columns)  # load the data


# Visualize the data first to look for patterns before fitting a model.
fig = plt.figure()
# Create the two axes the plots below draw into. They were referenced but
# never defined, which raised a NameError; a 2-row, 1-column grid stacks the
# two scatter plots vertically on the same figure.
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 1, 2)
cars.plot("weight", "mpg", kind='scatter', ax=ax1)  # x: weight, y: mpg, drawn as a scatter plot
cars.plot("acceleration", "mpg", kind='scatter', ax=ax2)  # x: acceleration, y: mpg
plt.show()


# Fit a single-feature linear regression that predicts mpg from weight.
import sklearn
from sklearn.linear_model import LinearRegression

weight_feature = cars[["weight"]]  # double brackets keep the input as a DataFrame
mpg_target = cars["mpg"]  # label values

lr = LinearRegression(fit_intercept=True)  # create the model
lr.fit(weight_feature, mpg_target)  # fit(inputs, labels)
predictions = lr.predict(weight_feature)  # predict on the same rows used for fitting

# Compare the first five predictions with the first five true values.
print(predictions[:5])
print(mpg_target[:5])


[19.41852276 17.96764345 19.94053224 19.96356207 19.84073631]
0 18.0
1 15.0
2 18.0
3 16.0
4 17.0
Name: mpg, dtype: float64

# A plot makes the comparison easier to read than raw numbers:
# true mpg values are drawn in red, model predictions in blue.
for y_values, colour in ((cars["mpg"], 'red'), (predictions, 'blue')):
    plt.scatter(cars["weight"], y_values, c=colour)
plt.show()


## 模型评估

# Measure how good the model is, using mean squared error (MSE) as the metric.
from sklearn.metrics import mean_squared_error  # MSE helper

weight_feature = cars[["weight"]]
mpg_target = cars["mpg"]

lr = LinearRegression()
lr.fit(weight_feature, mpg_target)
predictions = lr.predict(weight_feature)

# Arguments are passed as (true label values, predicted values).
mse = mean_squared_error(mpg_target, predictions)
print(mse)


18.780939734628397

# The square root of the MSE (RMSE) is what is usually reported: it is
# expressed in the same units as the mpg target, so it is easier to interpret.
mpg_true = cars["mpg"]
mse = mean_squared_error(mpg_true, predictions)
rmse = mse ** 0.5
print(rmse)


4.333698159150957

09-18

08-19 856
03-25 458
06-07 436