1.获取数据
对于想深入了解线性回归的童鞋,这里给出一个完整的例子,详细学完这个例子,对用scikit-learn来运行线性回归,评估模型不会有什么问题了。
获取数据,定义问题
这里我们用UCI(加州大学欧文分校)机器学习库公开的数据集来跑线性回归。
数据的介绍在这: http://archive.ics.uci.edu/ml/datasets/Combined+Cycle+Power+Plant
数据的下载地址在这: http://archive.ics.uci.edu/ml/machine-learning-databases/00294/
里面是一个联合循环发电厂的数据,共有9568个样本数据,每个数据有5列,分别是:AT(环境温度), V(排气真空度), AP(环境大气压), RH(相对湿度), PE(输出电力)。
我们的问题是得到一个线性的关系,对应PE是样本输出,而AT/V/AP/RH这4个是样本特征, 机器学习的目的就是得到一个线性回归模型,即:
$$PE = \theta_0 + \theta_1 \cdot AT + \theta_2 \cdot V + \theta_3 \cdot AP + \theta_4 \cdot RH$$
而需要学习的,就是 $\theta_0, \theta_1, \theta_2, \theta_3, \theta_4$ 这5个参数。
#coding:utf-8
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn import metrics
# Linear regression on the Combined Cycle Power Plant (CCPP) data using a
# manual 90/10 train/test split.

# Load the CSV; skiprows=1 drops the header line. A raw string keeps the
# Windows backslashes literal instead of relying on invalid escape sequences
# such as "\d" and "\C".
# NOTE: the variable name is historical -- this holds the CCPP data, not the
# diabetes dataset.
diabetes = np.loadtxt(r"F:\data\CCPP\Folds5x2_pp.csv", delimiter=',', dtype=float, skiprows=1)
print(diabetes)

# The first four columns are the features; the fifth column is the target.
data = diabetes[:, 0:4]
target = diabetes[:, 4]

# Shuffle features and targets together; the fixed seed makes the split
# reproducible.
x, y = shuffle(data, target, random_state=13)
print(x)
print(y)

# The first 90% of the shuffled rows are used for training, the rest for
# testing.
offset = int(x.shape[0] * 0.9)
print("offset:%s" % offset)
x_train, y_train = x[:offset], y[:offset]
print("x_train.shape[1]:%s" % x_train.shape[1])
print("x_train.shape[0]:%s" % x_train.shape[0])
print("y_train.shape[0]:%s" % y_train.shape[0])
x_test, y_test = x[offset:], y[offset:]

# Turn the targets into column vectors so that the predictions come back
# with the same (n_samples, 1) shape as y_test.
y_train = y_train.reshape(-1, 1)
y_test = y_test.reshape(-1, 1)

# Build the model. Alternative estimators that can be swapped in:
model = LinearRegression()
# model = Ridge(alpha=10)
# model = Lasso()
# model = ElasticNet()

# Train.
model.fit(x_train, y_train)

# Predict on the held-out rows.
predict = model.predict(x_test)
print('predict:', predict)
print('y_test:', y_test)

# Count the test samples whose absolute prediction error is below 5.
# predict and y_test are both (n_samples, 1), so the comparison is
# element-wise and no Python-level loop is needed.
num = int(np.count_nonzero(np.abs(predict - y_test) < 5))
print("预测率:%s" % (num / len(y_test)))

# Compute the mean squared error once and derive the RMSE from it.
mse = metrics.mean_squared_error(y_test, predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

# Show the fitted parameters. w is the coefficient row of the single target
# and b the intercept, i.e. prediction = b + x @ w.
print("coefficients:", model.coef_)
w = model.coef_[0]
print("intercept:", model.intercept_)
b = model.intercept_
运算结果如下:
diabetes
[[ 8.34 40.77 1010.84 90.01 480.48]
[ 23.64 58.49 1011.4 74.2 445.75]
[ 29.74 56.9 1007.15 41.91 438.76]
...,
[ 15.99 43.34 1014.2 78.66 465.96]
[ 17.65 59.87 1018.58 94.65 450.93]
[ 23.68 51.3 1011.86 71.24 451.67]]
x_train
[[ 28.77 58.33 1013.73 36.6 ]
[ 13.42 41.74 1020.96 61.8 ]
[ 6.17 39.33 1012.57 93.32]
...,
[ 5.17 39.33 1009.68 94.19]
[ 10.25 41.46 1018.67 84.41]
[ 18.32 45. 1022.67 46.38]]
y_train
[ 448.06 473.45 491.54 ..., 485.46 479.28 471.43]
offset:8611
x_train.shape[1]:4
x_train.shape[0]:8611
y_train.shape[0]:8611
预测率:0.7439916405433646
MSE: 18.7396222877
RMSE: 4.32892853807
coefficients: [[-1.97273028 -0.23312582 0.06450914 -0.15567916]]
intercept: [ 451.86222609]
方法2:
#coding:utf-8
import numpy as np
from sklearn.utils import shuffle
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
from sklearn import metrics
import pandas as pd
# sklearn.cross_validation was removed in scikit-learn 0.20; train_test_split
# now lives in sklearn.model_selection. The fallback keeps very old
# installations working.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

# Linear regression on the Combined Cycle Power Plant (CCPP) data, loaded
# with pandas and split with train_test_split.

# A raw string keeps the Windows backslashes literal instead of relying on
# invalid escape sequences such as "\d" and "\C".
data = pd.read_csv(r"F:\data\CCPP\Folds5x2_pp.csv")

# Feature columns and the single target column (kept as a one-column
# DataFrame so that the predictions have shape (n_samples, 1)).
x = data[['AT', 'V', 'AP', 'RH']]
y = data[['PE']]
print(y.head())

# Split with the library's default test size; the fixed seed makes the split
# reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1)
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)

# Build the model. Alternative estimators that can be swapped in:
model = LinearRegression()
# model = Ridge(alpha=10)
# model = Lasso()
# model = ElasticNet()

# Train.
model.fit(x_train, y_train)

# Predict on the held-out rows.
predict = model.predict(x_test)
# Convert the target DataFrame to a NumPy array so it can be compared
# element-wise with the predictions.
y_test = np.array(y_test)
print('y_test:', type(y_test))
print('y_test:', y_test)

# Count the test samples whose absolute prediction error is below 5.
# predict and y_test are both (n_samples, 1), so the comparison is
# element-wise and no Python-level loop is needed.
num = int(np.count_nonzero(np.abs(predict - y_test) < 5))
print("预测率:%s" % (num / len(y_test)))

# Compute the mean squared error once and derive the RMSE from it.
mse = metrics.mean_squared_error(y_test, predict)
print("MSE:", mse)
print("RMSE:", np.sqrt(mse))

# Show the fitted parameters. w is the coefficient row of the single target
# and b the intercept, i.e. prediction = b + x @ w.
print("coefficients:", model.coef_)
w = model.coef_[0]
print("intercept:", model.intercept_)
b = model.intercept_
预测率:0.7328595317725752
MSE: 20.0804012021
RMSE: 4.48111606657
coefficients: [[-1.97376045 -0.23229086 0.0693515 -0.15806957]]
intercept: [ 447.06297099]
注意:将DataFrame转换为数组(array)的方法,可参考如下文档:
https://blog.csdn.net/weixin_42263508/article/details/90487483
本算法参考了如下文档:
https://blog.csdn.net/lai19941994/article/details/81748660