一、线性模型介绍
1.线性回归的基本形式: $f(x_i) = w x_i + b$,使得 $f(x_i) \simeq y_i$
向量形式: $f(\boldsymbol{x}) = \boldsymbol{w}^{\mathsf T}\boldsymbol{x} + b$
均方误差最小: $(w^*, b^*) = \arg\min_{(w,b)} \sum_{i=1}^{m} \big(f(x_i) - y_i\big)^2$
求导: $\dfrac{\partial E}{\partial w} = 2\Big(w\sum_{i=1}^{m} x_i^2 - \sum_{i=1}^{m}(y_i - b)\,x_i\Big)$, $\dfrac{\partial E}{\partial b} = 2\Big(mb - \sum_{i=1}^{m}(y_i - w x_i)\Big)$
令导数等于 0,得闭式解: $w = \dfrac{\sum_{i=1}^{m} y_i (x_i - \bar{x})}{\sum_{i=1}^{m} x_i^2 - \frac{1}{m}\big(\sum_{i=1}^{m} x_i\big)^2}$, $b = \dfrac{1}{m}\sum_{i=1}^{m}(y_i - w x_i)$
多元线性回归的矩阵表示,均方误差最小: $\hat{\boldsymbol{w}}^* = \arg\min_{\hat{\boldsymbol{w}}}\ (\boldsymbol{y} - \mathbf{X}\hat{\boldsymbol{w}})^{\mathsf T}(\boldsymbol{y} - \mathbf{X}\hat{\boldsymbol{w}})$
对 $\hat{\boldsymbol{w}}$ 求导得: $\dfrac{\partial E}{\partial \hat{\boldsymbol{w}}} = 2\,\mathbf{X}^{\mathsf T}(\mathbf{X}\hat{\boldsymbol{w}} - \boldsymbol{y})$,令其为零可得 $\hat{\boldsymbol{w}}^* = (\mathbf{X}^{\mathsf T}\mathbf{X})^{-1}\mathbf{X}^{\mathsf T}\boldsymbol{y}$
二、python多元线性程序
1.数据集描述:
数据集Advertising.csv:
数据集Advertising包含了200个不同市场的产品销售额,每个销售额对应3种广告媒体投入成本,分别是:TV, radio, 和 newspaper。如果我们能分析出广告媒体投入与销售额之间的关系,我们就可以更好地分配广告开支并且使销售额最大化。
第一步:导入数据
1、使用pandas库read_csv()读取数据集,得到相应200行,四列的矩阵。
第二步:图形描述
使用matplotlib库画出:TV、Radio、Newspaper与产品销售额的数据散点图。
第三步:多元线性回归拟合与预测
TV、Radio、Newspaper的广告花费为自变量X、以销售额的值为因变量y。
# Import required packages.
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.font_manager import FontProperties
import numpy as np
# SimSun font so Chinese axis labels render; Windows-specific path — TODO confirm on other platforms.
font_set = FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=12)
# Load the advertising dataset (200 records of TV/radio/newspaper spend and sales).
data=pd.read_csv(
'Advertising.csv', # file name
sep=',', # comma-separated values
header=0, # first row holds the column names
index_col=0, # first column (record number) becomes the index
encoding='utf-8',
)
# Plotting helper for the raw data.
def graph1( data ):
    """Scatter-plot each advertising channel's spend against sales.

    Saves the figure to D://Advertising.png and displays it.
    """
    sales = data.sales
    # (series, color, marker, legend label) for each advertising channel.
    channels = (
        (data.TV, 'r', 'o', 'TV'),
        (data.radio, 'b', 'x', 'Radio'),
        (data.newspaper, 'y', 'd', 'Newspaper'),
    )
    for spend, color, mark, label in channels:
        plt.scatter(spend, sales, c=color, marker=mark, label=label)
    plt.legend()
    plt.ylabel("销售额", fontproperties=font_set)
    plt.xlabel('广告花费', fontproperties=font_set)
    plt.grid(linestyle='-.')
    plt.savefig('D://Advertising.png')
    plt.show()
# Render the exploratory scatter plot before fitting.
graph1(data)
# Features: the three ad-spend columns; target: sales.
X = data[['TV','radio','newspaper']]
y = data['sales']
X = X.astype(np.float32)
# Simple 80/20 train/test split (no shuffling; rows keep file order).
offset = int(X.shape[0] * 0.8)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
# BUG FIX: pandas.Series.reshape was removed in pandas >= 1.0; convert to a
# NumPy column vector explicitly so the gradient code broadcasts correctly.
y_train = y_train.to_numpy().reshape((-1, 1))
y_test = y_test.to_numpy().reshape((-1, 1))
print('X_train=', X_train.shape)
print('X_test=', X_test.shape)
print('y_train=', y_train.shape)
print('y_test=', y_test.shape)
# Loss and gradient for the linear model.
def linear_loss(X, y, w, b):
    """Return prediction, MSE loss, and gradients for y_hat = X @ w + b.

    Parameters
    ----------
    X : (n, d) feature matrix.
    y : (n, 1) target column vector.
    w : (d, 1) weight column vector.
    b : scalar bias.

    Returns
    -------
    (y_hat, loss, dw, db) : prediction, mean squared error, and the
    gradients of the loss with respect to w and b.
    """
    n_samples = X.shape[0]
    # Forward pass and residual (prediction error per sample).
    y_hat = np.dot(X, w) + b
    residual = y_hat - y
    # Mean squared error over all samples.
    loss = np.sum(residual ** 2) / n_samples
    # Gradients averaged over the batch.
    dw = np.dot(X.T, residual) / n_samples
    db = np.sum(residual) / n_samples
    return y_hat, loss, dw, db
# Parameter initialization: zero weights and zero bias.
def initialize_params(dims):
    """Return a (dims, 1) zero weight vector and a zero bias."""
    return np.zeros((dims, 1)), 0
# Gradient-descent training loop.
def linar_train(X, y, learning_rate, loop_max, epsilon=0.0001):
    """Fit a linear model y_hat = X @ w + b by batch gradient descent.

    Parameters
    ----------
    X : (n, d) feature matrix.
    y : (n, 1) target column vector.
    learning_rate : step size for each gradient update.
    loop_max : maximum number of iterations.
    epsilon : stop early when the L2 norm of the change in the stacked
        parameter vector [w; b] falls below this threshold.

    Returns
    -------
    (loss_list, loss, params, grads) : per-iteration losses, final loss,
    dict of trained parameters {'w', 'b'}, and dict of last gradients
    {'dw', 'db'}.
    """
    w, b = initialize_params(X.shape[1])
    # Previous stacked parameter vector [w; b], kept as a (d+1, 1) column so
    # the convergence test subtracts arrays of identical shape.
    # BUG FIX: the original used np.zeros(d+1) (shape (d+1,)), which made
    # w_new - error broadcast into a (d+1, d+1) matrix, so the first
    # convergence check computed a meaningless Frobenius norm.
    error = np.zeros((X.shape[1] + 1, 1))
    loss_list = []
    loss = None  # stays None if loop_max <= 0
    # w is updated in place below, so params['w'] always aliases the
    # current weights; b is a scalar and is rebound each iteration.
    params = {'w': w, 'b': b}
    grads = {}
    flag = 0
    i = 0
    while flag == 0 and i < loop_max:
        # Current prediction, loss, and parameter gradients.
        y_hat, loss, dw, db = linear_loss(X, y, w, b)
        loss_list.append(loss)
        # Gradient-descent update.
        w += -learning_rate * dw
        b += -learning_rate * db
        # Periodic progress report.
        if i % 10000 == 0:
            print('loop_max %d loss %f' % (i, loss))
        # Keep the returned dicts current.
        params['b'] = b
        grads['dw'] = dw
        grads['db'] = db
        # Converged once the parameter vector stops moving.
        w_new = np.insert(w, X.shape[1], values=b, axis=0)
        if np.linalg.norm(w_new - error) < epsilon:
            flag = 1
        else:
            error = w_new
        i += 1
    print('loop count = %d' % i, '\tw:', w)
    return loss_list, loss, params, grads
# Train on the 80% split; the small learning rate keeps the un-normalized
# features from diverging.
loss_list, loss, params, grads = linar_train(X_train, y_train, learning_rate=0.00005, loop_max=5000)
print(params)
# Prediction helper for a trained parameter dict.
def predict(X, params):
    """Apply the trained linear model: X @ w + b."""
    return np.dot(X, params['w']) + params['b']
y_pred = predict(X_test, params)
# Plot predicted vs. actual sales on the held-out test rows.
plt.figure()
plt.plot(range(len(y_pred)),y_pred,'b',label="sales_predict")
plt.plot(range(len(y_pred)),y_test,'r',label="sales_test")
plt.legend(loc="upper right")
plt.show()
参考文献:https://blog.csdn.net/qq_38054219/article/details/89667830