Analysis
Linear regression comes in two variants: univariate (a single feature) and multivariate (several features).
The main practical difference: the multivariate case adds a feature-scaling step, because features measured on very different scales make gradient descent converge slowly.
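For reference, written out in matrix form, the hypothesis, cost, and batch gradient-descent update that the code below implements are (m training samples, learning rate α):

h_\theta(X) = X\theta
J(\theta) = \frac{1}{2m}\,(X\theta - y)^\top (X\theta - y)
\theta \leftarrow \theta - \frac{\alpha}{m}\,X^\top (X\theta - y)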
Code and comments
import numpy as np
import matplotlib.pyplot as plt
# Font settings: render CJK characters and minus signs correctly in matplotlib
plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False
# Load the comma-separated data file
data = np.loadtxt(r'ex1data2.txt', delimiter=',')
# Shuffle the rows (fixed seed for reproducibility). shuffle works in place,
# whereas np.random.permutation returns a copy and would leave data unchanged.
np.random.seed(5)
np.random.shuffle(data)
# Features: every column but the last; target: the last column
x, y = data[:, :-1], data[:, -1]
# Preprocessing
def preProcess(x, y):
    # Feature scaling: zero mean and unit (sample) standard deviation per column.
    # Note: the statistics here are computed over the full dataset; strictly,
    # they should come from the training split only.
    x -= np.mean(x, 0)
    x /= np.std(x, 0, ddof=1)
    # Prepend a bias column of ones and reshape y into a column vector
    x = np.c_[np.ones(len(x)), x]
    y = np.c_[y]
    return x, y

x, y = preProcess(x, y)
# 70/30 train/test split
train_x, test_x = np.split(x, [int(0.7 * len(x))])
train_y, test_y = np.split(y, [int(0.7 * len(x))])
# Hypothesis: h = X @ theta
def model(x, theta):
    h = np.dot(x, theta)
    return h
# Cost function: J = (1/2m) * sum((h - y)^2)
def costFunc(h, y):
    e = h - y
    j = (1 / (2 * len(y))) * np.dot(e.T, e).item()  # .item() -> plain scalar
    return j
# Batch gradient descent
def gradDesc(x, y, alpha=0.01, max_iter=10000):
    m, n = x.shape
    # Initialization
    theta = np.zeros((n, 1))
    j_history = np.zeros(max_iter)  # cost recorded at every iteration
    for i in range(max_iter):
        h = model(x, theta)
        j_history[i] = costFunc(h, y)
        deltatheta = (1 / m) * np.dot(x.T, h - y)  # gradient of J w.r.t. theta
        theta -= alpha * deltatheta
    return j_history, theta
j_history, theta = gradDesc(train_x, train_y)
print(theta)
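# Optional sanity check: for linear least squares, the closed-form normal
# equation theta = (X^T X)^{-1} X^T y should closely match the converged
# gradient-descent result (pinv is used for numerical robustness).
theta_ne = np.linalg.pinv(train_x.T @ train_x) @ train_x.T @ train_y
print('normal-equation theta:', theta_ne.ravel())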
plt.title('Cost function over iterations')
plt.plot(j_history)
plt.show()
# Predictions on the training and test sets
train_h = model(train_x, theta)
test_h = model(test_x, theta)
# R^2 score (coefficient of determination)
def score(h, y):
    u = np.sum(np.square(h - y))           # residual sum of squares
    v = np.sum(np.square(y - np.mean(y)))  # total sum of squares
    return 1 - u / v
print('Training set R^2:', score(train_h, train_y))
print('Test set R^2:', score(test_h, test_y))
plt.title('Actual vs. predicted values (training set)')
plt.scatter(train_y, train_y, label='actual')
plt.scatter(train_y, train_h, label='predicted')
plt.legend()
plt.show()
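As a quick cross-check (assuming scikit-learn is available; nothing above depends on it), fitting sklearn's LinearRegression on the same split should reproduce essentially the same numbers, since its score method computes the same R^2:

from sklearn.linear_model import LinearRegression

# Drop the bias column of ones: LinearRegression fits its own intercept
reg = LinearRegression().fit(train_x[:, 1:], train_y)
print('sklearn training R^2:', reg.score(train_x[:, 1:], train_y))
print('sklearn test R^2:', reg.score(test_x[:, 1:], test_y))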
Results