Lesson-01: Understanding Loss Functions, Gradient Descent, and Function Fitting by Building Linear Regression
Load Dataset
from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2; see the fallback sketch below if unavailable
data = load_boston()
X, y = data['data'], data['target']
X[1]
y[1]
X.shape
len(y)
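If your scikit-learn no longer ships load_boston (it was removed in scikit-learn 1.2), its deprecation notice points to the original StatLib source; a minimal fallback sketch (not part of the original lesson) is:

import numpy as np
import pandas as pd
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])  # 13 features per house
y = raw_df.values[1::2, 2]                                       # median house price (target)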
%matplotlib inline
import matplotlib.pyplot as plt
plt.scatter(X[:, 5], y)  # column 5 is RM, the average number of rooms per dwelling
Goal: find the "best" straight line that fits the relationship between rooms and house price.
import random
k, b = random.randint(-100, 100), random.randint(-100, 100)
def func(x):
    return k*x + b
X_rm = X[:, 5]
y_hat = [func(x) for x in X_rm]
plt.scatter(X[:, 5], y)
plt.plot(X_rm, y_hat)
We drew a random line, and it turns out to be far off 🙁
def draw_room_and_price():
    plt.scatter(X[:, 5], y)

def price(x, k, b):
    return k*x + b
k, b = random.randint(-100, 100), random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
print('the random k : {}, b: {}'.format(k, b))
draw_room_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
The goal is to find the "best" k and b.
We need a criterion to measure how good a given k and b actually are.
We have the true values y_true and the model's predictions $\hat{y}$.
A function that measures how far $\hat{y}$ is from y_true is called a loss function. For example:
y_true = [1, 4, 1, 4,1, 4, 1,4]
y_hat = [2, 3, 1, 4, 1, 41, 31, 3]
L1-Loss
$$loss = \frac{1}{n} \sum_{i}^{n} | y_{true,i} - \hat{y_i} |$$
y_true = [3, 4, 4]
y_hat_1 = [1, 1, 4]
y_hat_2 = [3, 4, 0]
What is the L1-Loss of $\hat{y}_1$? |3 - 1| + |4 - 1| + |4 - 4| = 2 + 3 + 0 = 5, i.e. 5/3 after dividing by n.
For $\hat{y}_2$ the L1-Loss is |3 - 3| + |4 - 4| + |4 - 0| = 4, i.e. 4/3.
L2-Loss (mean squared error)
$$loss = \frac{1}{n} \sum_{i}^{n} (y_i - \hat{y_i})^2$$
def loss(y, y_hat):
    sum_ = sum([(y_i - y_hat_i) ** 2 for y_i, y_hat_i in zip(y, y_hat)])
    return sum_ / len(y)
y_true = [3, 4, 4]
y_hat_1 = [1, 1, 4]
y_hat_2 = [3, 4, 0]
print(loss(y_true, y_hat_1))
print(loss(y_true, y_hat_2))
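For comparison, a minimal sketch of the L1 loss written the same way (the name l1_loss is mine, not from the lesson):

def l1_loss(y, y_hat):
    # mean absolute error: the L1-Loss defined above
    sum_ = sum(abs(y_i - y_hat_i) for y_i, y_hat_i in zip(y, y_hat))
    return sum_ / len(y)

print(l1_loss(y_true, y_hat_1))  # 5/3 ≈ 1.67
print(l1_loss(y_true, y_hat_2))  # 4/3 ≈ 1.33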
def price(x, k, b):
    return k*x + b
k, b = random.randint(-100, 100), random.randint(-100, 100)
price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
print('the random k : {}, b: {}'.format(k, b))
draw_room_and_price()
plt.scatter(X_rm, price_by_random_k_and_b)
cost = loss(list(y), price_by_random_k_and_b)
print('The Loss of k: {}, b: {} is {}'.format(k, b, cost))
Loss: once you know how to evaluate whether something is good or bad, you have basically done half of the work.
The simplest method: randomly generate a number of (k, b) pairs and keep the best one.
def price(x, k, b):
    return k*x + b
trying_times = 5000
best_k, best_b = None, None
min_cost = float('inf')
losses = []
for i in range(trying_times):
    k = random.random() * 100 - 200
    b = random.random() * 100 - 200
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    # draw_room_and_price()
    # plt.scatter(X_rm, price_by_random_k_and_b)
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        min_cost = cost
        best_k, best_b = k, b
        print('k and b were updated at iteration {}'.format(i))
        losses.append(min_cost)
We could add a visualization.
min_cost
best_k, best_b
def plot_by_k_and_b(k, b):
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    draw_room_and_price()
    plt.scatter(X_rm, price_by_random_k_and_b)
plot_by_k_and_b(best_k, best_b)
Method 2: adjusting the direction
k can change in two ways: increase or decrease.
b can also change in two ways: increase or decrease.
So when we change the pair (k, b), there are 4 possible combinations:
When k and b move along some direction $d_n$ and the loss goes down, we keep moving k and b along $d_n$; otherwise we switch to a different direction.
directions = [
    (+1, -1),
    (+1, +1),
    (-1, -1),
    (-1, +1)
]
def price(x, k, b):
    return k*x + b
trying_times = 10000
best_k = random.random() * 100 - 200
best_b = random.random() * 100 - 200
next_direction = random.choice(directions)
min_cost = float('inf')
losses = []
scala = 0.3
for i in range(trying_times):
    current_direction = next_direction
    k_direction, b_direction = current_direction
    current_k = best_k + k_direction * scala
    current_b = best_b + b_direction * scala
    price_by_random_k_and_b = [price(r, current_k, current_b) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        min_cost = cost
        best_k, best_b = current_k, current_b
        print('k and b were updated at iteration {}'.format(i))
        losses.append((i, min_cost))
        next_direction = current_direction
    else:
        next_direction = random.choice(list(set(directions) - {current_direction}))
len(losses)
min_cost
Method 3: gradient descent
Can we, at every step, move in a direction that is guaranteed to decrease the loss?
Yes: the gradient of the loss with respect to k and b points in the direction of steepest increase, so stepping against the gradient always gives us a loss-decreasing direction.
$$loss = \frac{1}{n} \sum_i^n (y_i - \hat{y}_i)^2$$

$$loss = \frac{1}{n} \sum_i^n \big(y_i - (k x_i + b)\big)^2$$

$$\frac{\partial loss}{\partial k} = -\frac{2}{n}\sum_i \big(y_i - (k x_i + b)\big) x_i$$

$$\frac{\partial loss}{\partial b} = -\frac{2}{n}\sum_i \big(y_i - (k x_i + b)\big)$$

Written back in terms of $\hat{y}_i = k x_i + b$, these are the forms the code below implements:

$$\frac{\partial loss}{\partial k} = -\frac{2}{n}\sum_i (y_i - \hat{y}_i) x_i$$

$$\frac{\partial loss}{\partial b} = -\frac{2}{n}\sum_i (y_i - \hat{y}_i)$$
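For a single term, the chain rule gives the step used above; differentiating with respect to $k$:

$$\frac{\partial}{\partial k}\big(y_i - (k x_i + b)\big)^2 = 2\big(y_i - (k x_i + b)\big)\cdot(-x_i) = -2\big(y_i - (k x_i + b)\big)x_i$$

Averaging over the $n$ samples gives the $-\frac{2}{n}\sum$ form, and the same reasoning with $\frac{\partial}{\partial b}$ simply drops the trailing $x_i$.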
def partial_k(x, y, y_hat):
    gradient = 0
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2 / len(y) * gradient

def partial_b(y, y_hat):
    gradient = 0
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    return -2 / len(y) * gradient
def price(x, k, b):
    # Operation: CNN, RNN, LSTM, Attention are mappings more complex than k*x + b
    return k*x + b

trying_times = 10000
min_cost = float('inf')
losses = []
scala = 0.3
k, b = random.random() * 100 - 200, random.random() * 100 - 200
# Weight initialization: where k and b start from matters (the weight initialization problem!)
best_k, best_b = None, None
learning_rate = 1e-3  # optimizer step size
for i in range(trying_times):
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        print('k and b were updated at iteration {}'.format(i))
        min_cost = cost
        best_k, best_b = k, b
        losses.append((i, min_cost))
    k_gradient = partial_k(X_rm, y, price_by_random_k_and_b)  # direction of change
    b_gradient = partial_b(y, price_by_random_k_and_b)
    k = k + (-1 * k_gradient) * learning_rate
    ## Optimizer: smarter update rules exist, e.g. Adam, momentum
    b = b + (-1 * b_gradient) * learning_rate
cost
def price(x, k, b):
    # Operation: CNN, RNN, LSTM, Attention are mappings more complex than k*x + b
    return k*x + b

trying_times = 50000
min_cost = float('inf')
losses = []
scala = 0.3
k, b = random.random() * 100 - 200, random.random() * 100 - 200
# Weight initialization: where k and b start from matters (the weight initialization problem!)
best_k, best_b = None, None
learning_rate = 1e-3  # optimizer step size
for i in range(trying_times):
    price_by_random_k_and_b = [price(r, k, b) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    if cost < min_cost:
        # print('k and b were updated at iteration {}'.format(i))
        min_cost = cost
        best_k, best_b = k, b
        losses.append((i, min_cost))
    k_gradient = partial_k(X_rm, y, price_by_random_k_and_b)  # direction of change
    b_gradient = partial_b(y, price_by_random_k_and_b)
    k = k + (-1 * k_gradient) * learning_rate
    ## Optimizer: smarter update rules exist, e.g. Adam, momentum
    b = b + (-1 * b_gradient) * learning_rate
Wrap the pieces up into reusable blocks, so that the next person does not have to start writing everything from scratch; see the sketch below.
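For example, a minimal sketch of such packaging (the name fit_linear and its signature are mine, not part of the lesson) that reuses price, loss, partial_k and partial_b from above:

def fit_linear(x, y, learning_rate=1e-3, epochs=10000):
    # wraps the gradient-descent loop above into one reusable function
    k, b = random.random() * 100 - 200, random.random() * 100 - 200
    history = []
    for _ in range(epochs):
        y_hat = [price(x_i, k, b) for x_i in x]
        history.append(loss(list(y), y_hat))
        k -= partial_k(x, y, y_hat) * learning_rate
        b -= partial_b(y, y_hat) * learning_rate
    return k, b, history

# k_fit, b_fit, history = fit_linear(X_rm, y)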
len(losses)
print(min_cost)
best_k, best_b
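The comments in the loop above mention optimizers such as momentum and Adam. As a hedged aside (not part of the lesson's code), a momentum-style update for k and b would look roughly like this:

# momentum sketch: keep an exponential moving average of the gradient (a "velocity")
# and step along it instead of the raw gradient; beta = 0.9 is an assumed coefficient.
# (one would normally re-initialize k and b before running this)
beta = 0.9
v_k, v_b = 0.0, 0.0
for i in range(trying_times):
    y_hat = [price(r, k, b) for r in X_rm]
    v_k = beta * v_k + (1 - beta) * partial_k(X_rm, y, y_hat)
    v_b = beta * v_b + (1 - beta) * partial_b(y, y_hat)
    k -= learning_rate * v_k
    b -= learning_rate * v_b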
def square(x):
    return 10 * x**2 + 5 * x + 5
import numpy as np
_X = np.linspace(-100, 100)
_y = [square(x) for x in _X]
plt.plot(_X, _y)
plot_by_k_and_b(k=best_k, b=best_b)
plot_by_k_and_b(k=best_k, b=best_b)
draw_room_and_price()
min_cost
How can we make the cost even smaller?
How do we make the fitting function nonlinear?
import numpy as np
def sigmoid(x):
    return 1 / (1 + np.exp(-x))
test_x = np.linspace(-10, 10, 2000)
test_y = sigmoid(test_x)
plt.plot(test_x, test_y)
def random_linear(x):
    k, b = np.random.normal(), np.random.normal()
    return k * x + b
for _ in range(5):
    plt.plot(sigmoid(random_linear(test_x)))

for _ in range(5):
    plt.plot(random_linear(sigmoid(random_linear(test_x))))

for _ in range(5):
    plt.plot(sigmoid(random_linear(sigmoid(random_linear(test_x)))))
def relu(x):
    return x * (x > 0)

def so_many_layers(x, layers):
    if len(layers) == 1: return layers[0](x)
    return so_many_layers(layers[0](x), layers[1:])
layers = [random_linear, relu, random_linear, sigmoid, random_linear, sigmoid]
for _ in range(10):
    plt.plot(so_many_layers(test_x, layers))

for _ in range(20):
    plt.plot(relu(random_linear(relu(random_linear(test_x)))))
def price(x, k, b):
    # Operation: CNN, RNN, LSTM, Attention are mappings more complex than k*x + b
    return k*x + b

def linear(x, k1, b1):
    return k1 * x + b1

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def model(x, k1, k2, b1, b2):
    # named model (not y) so it does not shadow the target vector y;
    # computes k2 * sigmoid(k1 * x + b1) + b2, each output feeding the next layer
    output1 = linear(x, k1, b1)
    output2 = sigmoid(output1)
    output3 = linear(output2, k2, b2)
    return output3
trying_times = 50000
min_cost = float('inf')
losses = []
scala = 0.3
#k, b = random.random() * 100 - 200, random.random() * 100 - 200
# Weight initialization: where k1, b1, k2, b2 start from matters (the weight initialization problem!)
k1, b1 = np.random.normal(), np.random.normal()
k2, b2 = np.random.normal(), np.random.normal()
learning_rate = 1e-3 # Optimizer Rate
for i in range(trying_times):
    price_by_random_k_and_b = [model(r, k1, k2, b1, b2) for r in X_rm]
    cost = loss(list(y), price_by_random_k_and_b)
    # partial_k / partial_b were derived for the plain linear model y_hat = k*x + b,
    # so they no longer give the right direction of change here. We need the partial
    # derivatives of the loss with respect to k1, b1, k2 and b2; the chain rule in the
    # Review section below shows how to obtain them (partial_of_k1 etc. are placeholders
    # until then).
    k1 += -1 * (partial_of_k1) * learning_rate
    k2 += -1 * (partial_of_k2) * learning_rate
    b1 += -1 * (partial_of_b1) * learning_rate
    b2 += -1 * (partial_of_b2) * learning_rate
Review
$$\sigma(x) = \frac{1}{1 + e^{-x}}$$

$$loss(y, \hat{y}) = \frac{1}{n} \sum_i (y_i - \hat{y}_i)^2$$

$$\hat{y} = k_2 \cdot \sigma(k_1 \cdot x + b_1) + b_2$$
$$\frac{\partial loss}{\partial k_1} = \frac{\partial loss}{\partial \hat{y}} \cdot \frac{\partial \hat{y}}{\partial \sigma} \cdot \frac{\partial \sigma}{\partial (linear)} \cdot \frac{\partial (linear)}{\partial k_1}$$
def partial_k(x, y, y_hat):
    gradient = 0
    for x_i, y_i, y_hat_i in zip(list(x), list(y), list(y_hat)):
        gradient += (y_i - y_hat_i) * x_i
    return -2 / len(y) * gradient

def partial_b(y, y_hat):
    gradient = 0
    for y_i, y_hat_i in zip(list(y), list(y_hat)):
        gradient += (y_i - y_hat_i)
    return -2 / len(y) * gradient

def loss(y, y_hat):
    sum_ = sum([(y_i - y_hat_i) ** 2 for y_i, y_hat_i in zip(y, y_hat)])
    return sum_ / len(y)
def loss_partial(y, y_hat):
    # ∂loss/∂ŷ for the mean squared error
    return 2/len(y) * sum(y_i - y_hat_i for y_i, y_hat_i in zip(y, y_hat)) * -1

def linear(k, b, x):
    return k * x + b

def linear_partial(k, b, x):
    # ∂(k*x + b)/∂x = k: derivative with respect to the layer's *input*
    return k

# y_hat = linear(k2, b2, sigmoid(linear(k1, b1, x)))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_partial(x):
    return sigmoid(x) * (1 - sigmoid(x))

partials = [
    loss_partial(y, y_hat),              # ∂loss/∂ŷ
    linear_partial(k2, b2, sigmoid(x)),  # ∂ŷ/∂σ = k2
    sigmoid_partial(linear(k1, b1, x)),  # ∂σ/∂(linear) = σ'(k1*x + b1)
    x                                    # ∂(linear)/∂k1 = x (not linear_partial, which differentiates w.r.t. x)
]

loss_partial_of_k1 = 1
for p in partials:
    loss_partial_of_k1 *= p
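Putting the whole chain rule to work across the dataset, here is a hedged sketch (the function name gradients and its internals are mine, not the lesson's) of the four gradients for $\hat{y} = k_2 \cdot \sigma(k_1 x + b_1) + b_2$; this is what the placeholders partial_of_k1, partial_of_b1, partial_of_k2, partial_of_b2 in the earlier training loop would have to compute:

def gradients(x, y_true, k1, b1, k2, b2):
    # chain-rule gradients of the MSE loss for y_hat = k2 * sigmoid(k1*x + b1) + b2
    n = len(y_true)
    g_k1 = g_b1 = g_k2 = g_b2 = 0.0
    for x_i, y_i in zip(list(x), list(y_true)):
        z = k1 * x_i + b1                    # first linear layer
        s = sigmoid(z)                       # activation
        y_hat_i = k2 * s + b2                # second linear layer
        d_loss = -2 / n * (y_i - y_hat_i)    # ∂loss/∂ŷ_i
        g_k2 += d_loss * s                   # ∂ŷ/∂k2 = σ(z)
        g_b2 += d_loss                       # ∂ŷ/∂b2 = 1
        d_z = d_loss * k2 * s * (1 - s)      # back through k2 and σ
        g_k1 += d_z * x_i                    # ∂z/∂k1 = x
        g_b1 += d_z                          # ∂z/∂b1 = 1
    return g_k1, g_b1, g_k2, g_b2

# partial_of_k1, partial_of_b1, partial_of_k2, partial_of_b2 = gradients(X_rm, y, k1, b1, k2, b2)

Doing this by hand for every new architecture quickly becomes tedious and error-prone, which motivates the next step.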
How can we make this more convenient?
computing_graph = {
    'x1': ['linear'],
    'k1': ['linear'],
    'b1': ['linear'],
    'linear': ['sigmoid'],
    'sigmoid': ['linear_2'],
    'k2': ['linear_2'],
    'b2': ['linear_2'],
    'linear_2': ['loss']
}
import networkx as nx
graph = nx.DiGraph(computing_graph)
layout = nx.layout.spring_layout(graph)
nx.draw(nx.DiGraph(computing_graph), layout, with_labels=True)
def visited_procedure(graph, postion, visited_order, step, sub_plot_index=None, colors=('red', 'green')):
    changed = visited_order[:step] if step is not None else visited_order
    before, after = colors
    color_map = [after if c in changed else before for c in graph]
    nx.draw(graph, postion, node_color=color_map, with_labels=True, ax=sub_plot_index)
visited_order = ['x1', 'b1', 'k1', 'linear', 'sigmoid', 'b2', 'k2','linear_2', 'loss']
Feed Forward
dimension = int(len(visited_order)**0.5)
fig, ax = plt.subplots(dimension, dimension+1, figsize=(15,15))
for i in range(len(visited_order)+1):
    ix = np.unravel_index(i, ax.shape)
    plt.sca(ax[ix])
    ax[ix].title.set_text('Feed Forward Step: {}'.format(i))
    visited_procedure(graph, layout, visited_order, step=i, sub_plot_index=ax[ix])
Backward
dimension = int(len(visited_order)**0.5)
fig, ax = plt.subplots(dimension, dimension+1, figsize=(15,15))
for i in range(len(visited_order)+1):
    ix = np.unravel_index(i, ax.shape)
    plt.sca(ax[ix])
    ax[ix].title.set_text('Backward Step: {}'.format(i))
    visited_procedure(graph, layout, visited_order[::-1], step=i, sub_plot_index=ax[ix],
                      colors=('green', 'black'))
def toplogic(graph):
    sorted_node = []
    while len(graph) > 0:
        all_inputs = []
        all_outputs = []
        for n in graph:
            all_inputs += graph[n]
            all_outputs.append(n)
        all_inputs = set(all_inputs)
        all_outputs = set(all_outputs)
        need_remove = all_outputs - all_inputs  # nodes that nothing points to: no incoming edges left
        if len(need_remove) > 0:
            node = random.choice(list(need_remove))
            graph.pop(node)
            sorted_node.append(node)
            for _, links in graph.items():
                if node in links: links.remove(node)
        else:
            break
    return sorted_node
computing_graph
toplogic(computing_graph)  # note: toplogic pops nodes from its argument, so this empties computing_graph
Now it is time to start creating the framework.
import numpy as np

class Node:
    def __init__(self, inputs=[]):
        self.inputs = inputs
        self.outputs = []

        for n in self.inputs:
            n.outputs.append(self)
            # set 'self' node as inbound_nodes's outbound_nodes

        self.value = None

        self.gradients = {}
        # keys are the inputs to this node, and their
        # values are the partials of this node with
        # respect to that input.
        # \partial{node}{input_i}

    def forward(self):
        '''
        Forward propagation.
        Compute the output value based on 'inbound_nodes' and store the
        result in self.value
        '''
        raise NotImplementedError

    def backward(self):
        raise NotImplementedError
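As a minimal sketch of how this base class is meant to be used (the subclass names Placeholder and Linear are my own illustration, not necessarily what the finished framework will define), an input node and a linear node could look like:

class Placeholder(Node):
    # a node with no inputs; its value is fed in from outside (e.g. x, k, b)
    def __init__(self):
        Node.__init__(self, inputs=[])

    def forward(self, value=None):
        if value is not None:
            self.value = value

    def backward(self):
        self.gradients = {self: 0}
        for n in self.outputs:
            self.gradients[self] += n.gradients[self]

class Linear(Node):
    # computes k * x + b from its three input nodes
    def __init__(self, x, k, b):
        Node.__init__(self, inputs=[x, k, b])

    def forward(self):
        x, k, b = self.inputs
        self.value = k.value * x.value + b.value

    def backward(self):
        x, k, b = self.inputs
        self.gradients = {n: 0 for n in self.inputs}
        for n in self.outputs:
            grad = n.gradients[self]          # gradient flowing back from the next node
            self.gradients[x] += grad * k.value
            self.gradients[k] += grad * x.value
            self.gradients[b] += grad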