# Gradient descent demo
import random
def loss(k):
    """Demo loss function: f(k) = 3k^2 + 7k - 10."""
    return 3 * k ** 2 + 7 * k - 10
# Analytic minimum of the parabola: -b / 2a = -7 / 6
def partial(k):
    """Derivative of the demo loss: d/dk (3k^2 + 7k - 10) = 6k + 7."""
    return 7 + 6 * k
# Start from a random integer guess and walk down the gradient.
k = random.randint(-10, 10)
alpha = 1e-3  # learning rate (0.001)
for step in range(1000):
    k -= partial(k) * alpha
    print(k, loss(k))
# Predict Boston house prices from the RM and LSTAT features.
# RM: average number of rooms per dwelling in the neighborhood.
# LSTAT: percentage of lower-income population in the area.
# Topics: meaning of the loss, meaning of gradient descent, stochastic gradient descent.
import random
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
# Load the Boston housing dataset and build a labeled DataFrame.
# NOTE(review): load_boston was removed from scikit-learn 1.2+ — confirm the pinned sklearn version.
dataset = load_boston()
data = dataset['data']
target = dataset['target']
# print(target)
columns = dataset['feature_names']
# print(columns)
dataframe = pd.DataFrame(data)
dataframe.columns = columns
dataframe['price'] = target
# print(dataframe.head(10))
print(dataframe.corr())  # show the correlation of dataframe variables
# correlation -> 1: as one variable increases, the other increases in fixed proportion
# correlation -> 0: no relationship between the two variables
# correlation -> -1: as one variable increases, the other decreases in equal proportion
# sns.heatmap(dataframe.corr())
# plt.show()
rm = dataframe['RM']
lstat = dataframe['LSTAT']
price = dataframe['price']
# 66th percentile of price: the price at the 66% position, low to high.
greater_then_most = np.percentile(price, 66)
print(greater_then_most)
# Binary label: 1 if the price is above the 66th percentile, else 0.
dataframe['expensive'] = dataframe['price'].apply(lambda p: int(p > greater_then_most))
target = dataframe['expensive']
print(dataframe[:20])
def linear(x, w, b):
    """Vectorized linear model: returns x . w^T + b."""
    return np.dot(x, np.transpose(w)) + b
def loss(yhat, y):
    """Mean squared error between predictions and targets (NumPy broadcasting)."""
    diff = yhat - y
    return np.mean(diff * diff)
def partial_w(x, y, yhat):
    """Partial derivative of the MSE loss with respect to w, not x.

    (The original docstring said "partial wrt x"; the expression differentiates
    the squared error wrt each weight, giving 2 * mean((yhat - y) * x_i).)
    """
    err = yhat - y
    return np.array([2 * np.mean(err * x[0]), 2 * np.mean(err * x[1])])
def partial_b(x, y, yhat):
    """Partial derivative of the MSE loss with respect to the bias b."""
    return 2 * np.mean(yhat - y)
def optimize(w, b, x, y, yhat, pw, pb, learning_rate):
    """One gradient-descent step: move (w, b) against their gradients."""
    w = w - pw(x, y, yhat) * learning_rate
    b = b - pb(x, y, yhat) * learning_rate
    return w, b
def train(model_to_be_train, target, loss, pw, pb):
    """Fit the model by stochastic gradient descent on (rm, lstat) samples.

    Returns (model, learned w, learned b, per-epoch mean losses).
    Relies on module-level globals `rm` and `lstat` for features.
    """
    w = np.random.random_sample((1, 2))  # random initial weights
    b = np.random.random()               # random initial bias
    learning_rate = 1e-5
    epoch = 200
    losses = []
    for i in range(epoch):
        batch_loss = []
        for batch in range(len(rm)):
            # Stochastic sampling: train on one randomly chosen example.
            index = random.choice(range(len(rm)))
            rm_x, lstat_x = rm[index], lstat[index]
            x = np.array([rm_x, lstat_x])
            y = target[index]
            yhat = model_to_be_train(x, w, b)
            loss_v = loss(yhat, y)
            batch_loss.append(loss_v)
            w, b = optimize(w, b, x, y, yhat, pw, pb, learning_rate)
            if batch % 100 == 0:
                print('Epoch: {} Batch: {}, loss: {}'.format(i, batch, loss_v))
        losses.append(np.mean(batch_loss))
    return model_to_be_train, w, b, losses
if __name__ == "__main__":
    import matplotlib.pyplot as plt
    # Regression: train on raw prices to find the best w and b.
    target = dataframe['price']
    model, w, b, losses = train(linear, target, loss, partial_w, partial_b)
    plt.plot(losses)
    # Predict with the learned w and b for one feature vector [RM, LSTAT].
    predicate = model(np.array([19, 7]), w, b)
    print(predicate)
    plt.show()
# Classification: predict whether a Boston house is "expensive" from RM and LSTAT.
# Logistic-regression loss notes:
# Why not define the loss as 1 - yhat when yhat is near 1, and as yhat when
# yhat is near 0? Because then the gradient would change too uniformly.
# Instead we use loss = -log(yhat) as y -> 1 and loss = -log(1 - yhat) as y -> 0:
# this effectively keeps the output squeezed into (0, 1) and handles outliers
# more gracefully (a qualitative explanation).
# Logistic regression seeks parameters theta that minimize the difference
# between the computed outputs and the true labels; the loss is defined from that.
# Reference: https://blog.csdn.net/xiaoxifei/article/details/106092353
from sklearn.datasets import load_boston
import pandas as pd
import seaborn as sns
import numpy as np
# TODO: import `train` from the previous lesson's module, e.g.:
# from gradient_descent_lesson import train
from matplotlib import pyplot as plt
# Load the Boston housing dataset and build a labeled DataFrame.
# NOTE(review): load_boston was removed from scikit-learn 1.2+ — confirm the pinned sklearn version.
dataset = load_boston()
data = dataset['data']
target = dataset['target']
columns = dataset['feature_names']
dataframe = pd.DataFrame(data)
dataframe.columns = columns
dataframe['price'] = target
print(dataframe.corr())  # show the correlation of dataframe variables
# correlation -> 1: as one variable increases, the other increases in fixed proportion
# correlation -> 0: no relationship between the two variables
# correlation -> -1: as one variable increases, the other decreases in equal proportion
# sns.heatmap(dataframe.corr())
# plt.show()
# RM: average number of rooms per dwelling in the neighborhood.
# LSTAT: percentage of lower-income population in the area.
rm = dataframe['RM']
lstat = dataframe['LSTAT']
price = dataframe['price']
# 66th percentile of price: the price at the 66% position, low to high.
greater_then_most = np.percentile(price, 66)
print(greater_then_most)
# Binary label: 1 if the price is above the 66th percentile, else 0.
dataframe['expensive'] = dataframe['price'].apply(lambda p: int(p > greater_then_most))
target = dataframe['expensive']
print(dataframe[:20])
def sigmoid(x):
    """Logistic sigmoid: maps x into (0, 1), smooth and differentiable everywhere."""
    return 1.0 / (np.exp(-x) + 1.0)
def model(x, w, b):
    """Logistic-regression model: sigmoid of the linear score x . w^T + b.

    :param x: feature vector [rm, lstat]
    :param w: weight row vector [[w1, w2]]
    :param b: scalar bias
    """
    score = np.dot(x, w.T) + b
    return sigmoid(score)
def loss(yhat, y):
    """Binary cross-entropy (summed, not averaged) for yhat in (0, 1) and labels y."""
    pos_term = y * np.log(yhat)
    neg_term = (1 - y) * np.log(1 - yhat)
    return -np.sum(pos_term + neg_term)
def partial_w(x, y, yhat):
    """Gradient of the summed cross-entropy loss with respect to the two weights."""
    err = yhat - y
    return np.array([np.sum(err * x[0]), np.sum(err * x[1])])
def partial_b(x, y, yhat):
    """Gradient of the summed cross-entropy loss with respect to the bias b."""
    return np.sum(yhat - y)
# Train the logistic-regression classifier (reuses the shared SGD train loop).
model, w, b, losses = train(model, target, loss, partial_w, partial_b)
# Evaluate on 100 randomly chosen rows.
random_test_indices = np.random.choice(range(len(rm)), size=100)
decision_boundary = 0.5  # predict 1 when the score exceeds this threshold
for i in random_test_indices:
    x1, x2, y = rm[i], lstat[i], target[i]
    predicate = model(np.array([x1, x2]), w, b)
    predicate_label = int(predicate > decision_boundary)
    print('RM: {}, LSTAT: {}, EXPENSIVE: {}, Predicated: {}'.format(x1, x2, y, predicate_label))