Reference: https://www.cnblogs.com/hello-/articles/9737549.html
Test data: https://download.csdn.net/download/yuanshuaipeng/12423234
Scenario: suppose you are the administrator of a university department and want to decide each applicant's chance of admission based on the results of two exams. You have historical data on previous applicants, which can serve as a training set for logistic regression. Each training example contains an applicant's scores on the two exams and the admission decision. The task is to build a classification model that estimates the probability of admission from the exam scores.
Import the core modules, numpy, pandas, and matplotlib (plus time for timing the descent):
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
Read the raw data:
# Read the data
data = pd.read_csv("./LogiReg_data.txt", header=None,
names=["score1", "score2", "label"])
print(data.head(3))
print(data.shape)
Visualize the data:
def show_data(data):
set_0 = data[data["label"]==0]
set_1 = data[data["label"]==1]
plt.figure(figsize=(10,5))
    plt.scatter(set_0["score1"], set_0["score2"], c="r", marker="*", label="not admitted (0)")
    plt.scatter(set_1["score1"], set_1["score2"], c="b", marker="o", label="admitted (1)")
    plt.legend()
plt.xlabel("score1")
plt.ylabel("score2")
plt.show()
Logistic regression implementation
Goal: build a classifier by solving for the three parameters θ0, θ1, θ2, where θ0 is the bias term and θ1, θ2 weight the first and second exam scores.
Then set a threshold and use it to decide admission (see the note below).
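A brief aside (not in the original post): since sigmoid(z) >= 0.5 exactly when z >= 0, thresholding the predicted probability at 0.5 is equivalent to checking the sign of the linear score:

hθ(x) = sigmoid(θ0 + θ1·x1 + θ2·x2) >= 0.5  ⟺  θ0 + θ1·x1 + θ2·x2 >= 0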
Modules to implement:
sigmoid : maps values to probabilities
model : returns the predicted values
cost : computes the loss for the given parameters
gradient : computes the gradient of each parameter
descent : performs the parameter updates
1. The sigmoid function:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
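A one-line sanity check (my addition): the output should be 0.5 at z = 0 and saturate toward 0 and 1 at the extremes.

print(sigmoid(np.array([-10.0, 0.0, 10.0])))  # ≈ [4.54e-05, 0.5, 0.99995]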
2. The prediction function:
def model(X, theta):
    """
    X: sample matrix (including the leading column of ones)
    theta: row vector [θ0, θ1, ..., θn]
    Returns the predicted values.
    """
    return sigmoid(np.dot(X, theta.T))
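A minimal shape check for model (my illustration; X_demo and theta_demo are made-up toy values):

X_demo = np.array([[1.0, 30.0, 40.0],
                   [1.0, 80.0, 90.0]])  # 2*3: ones column plus the two scores
theta_demo = np.zeros((1, 3))           # 1*3 row vector of parameters
print(model(X_demo, theta_demo))        # 2*1 column; all 0.5 because theta is all zeros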
3. Raw data and parameter initialization
# Add a column of ones to the data (for the bias term θ0)
data.insert(0, "Ones", value=1)
print(data.loc[:3, :])
# set X and y
orig_data = data.to_numpy()  # as_matrix() was removed in pandas 1.0
# m = number of samples, n = number of columns (ones column + 2 features + label)
m, n = orig_data.shape
theta = np.zeros(shape=(1, n - 1))
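A quick shape check (my addition; the sample count assumes the usual 100-row LogiReg_data.txt):

print(orig_data.shape)  # expected (100, 4): ones column + two scores + label
print(theta.shape)      # (1, 3): one parameter per column of X (the label is excluded)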
4. The cost function
def cost(X, y, theta):
    """
    Compute the loss: the mean of the per-sample losses.
    """
    # Note the difference between np.multiply (element-wise) and np.dot (matrix product)
    left = np.multiply(-y, np.log(model(X, theta)))
    right = np.multiply(1 - y, np.log(1 - model(X, theta)))
    return np.sum(left - right) / len(X)
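For reference, this is the standard averaged cross-entropy loss; left - right is exactly the negated bracketed term:

J(θ) = -(1/m) · Σᵢ [ yᵢ·log(hθ(xᵢ)) + (1 - yᵢ)·log(1 - hθ(xᵢ)) ]

where hθ is the model function above and m = len(X).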
5. Computing the gradient:
def gradient(X, y, theta):
    """
    Compute the gradient; returns an array of the same shape as theta.
    """
    # 2-D array, 1*n
    grad = np.zeros(shape=theta.shape)
    # error is m*1 (one residual per sample)
    error = y - model(X, theta)
    for j in range(grad.shape[1]):
        # grad is 1*n
        grad[0, j] = -np.dot(error.T, X[:, j]) / X.shape[0]
    return grad
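The per-parameter loop can also be collapsed into a single matrix product. A minimal vectorized sketch (my addition, not the author's code; a loop-based alternative, gradient2, appears in the full listing below):

def gradient_vec(X, y, theta):
    # error is m*1 and X is m*n, so X.T @ error is n*1; transpose back to 1*n
    error = y - model(X, theta)
    return -np.dot(X.T, error).T / X.shape[0]

On the same inputs this returns the same 1*n array as gradient().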
6. Implementing gradient descent:
6.1 Three stopping criteria:
1. iteration count  2. change in the loss  3. gradient norm
# Stopping criteria
STOP_ITER = 0  # number of iterations
STOP_COST = 1  # change in the loss
STOP_GRAD = 2  # gradient norm
def stop_criterion(type, value, threshold):
    if type == STOP_ITER: return value > threshold
    elif type == STOP_COST: return abs(value[-1] - value[-2]) < threshold
    elif type == STOP_GRAD: return np.linalg.norm(value) < threshold
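For example (my illustration, with made-up values):

costs_demo = [0.693, 0.651]
print(stop_criterion(STOP_ITER, 5001, 5000))        # True: iteration budget exceeded
print(stop_criterion(STOP_COST, costs_demo, 1e-6))  # False: the loss still changed by 0.042
print(stop_criterion(STOP_GRAD, np.array([[0.01, 0.02, 0.01]]), 0.05))  # True: norm ≈ 0.024 < 0.05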
6.2 Three gradient descent variants: 1. full-batch gradient descent, 2. stochastic gradient descent, 3. mini-batch gradient descent
All three are selected through the batchsize argument.
# Shuffle the data
def shuffle_data(data):
    np.random.shuffle(data)
    X = data[:, :-1]
    y = data[:, -1:]
    return X, y
def descent(data, theta, batchsize, stoptype, threshold, alpha):
    i = 0  # iteration counter
    b = 0  # batch start index
    init_time = time.time()
    # Initialize the gradient
    grad = np.zeros(theta.shape)
    # Shuffle the data first
    X, y = shuffle_data(data)
    # Initialize the cost history
    cost_value = [cost(X, y, theta)]
    while True:
        # Compute the gradient on the current batch
        grad = gradient(X[b:b+batchsize, :], y[b:b+batchsize, :], theta)
        b += batchsize
        # After a full pass over the data, reshuffle it
        if b >= X.shape[0]:
            b = 0
            X, y = shuffle_data(data)
        # Update the parameters along the negative gradient
        theta = theta - alpha * grad
        # Loss value
        ## Note: the gradient may use only a batch of samples, but the loss is computed on all of them
        cost_value.append(cost(X, y, theta))
        i += 1
        if stoptype == STOP_ITER: value = i
        elif stoptype == STOP_COST: value = cost_value
        elif stoptype == STOP_GRAD: value = grad
        # Check the stopping criterion
        if stop_criterion(stoptype, value, threshold): break
    # Return: theta, iteration count, cost history, final gradient, elapsed time
    return theta, i - 1, cost_value, grad, time.time() - init_time
7. Driver function and result display:
def run(data, theta, batchSize, stopType, thresh, alpha):
    theta, n_iter, costs, grad, dur = descent(data, theta, batchSize, stopType, thresh, alpha)
    # Build a descriptive title and plot the cost curve
    # If the feature column still holds raw scores (> 2), the data is unscaled
    name = "Original" if (data[:, 1] > 2).sum() > 1 else "Scaled"
name += " data - learning rate: {} - ".format(alpha)
if batchSize == data.shape[0]:
strDescType = "Gradient"
elif batchSize == 1:
strDescType = "Stochastic"
else:
strDescType = "Mini-batch ({})".format(batchSize)
name += strDescType + " descent - Stop: "
if stopType == STOP_ITER:
strStop = "{} iterations".format(thresh)
elif stopType == STOP_COST:
strStop = "costs change < {}".format(thresh)
else:
strStop = "gradient norm < {}".format(thresh)
name += strStop
print("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
name, theta, iter, costs[-1], dur))
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(np.arange(len(costs)), costs, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title(name.upper() + ' - Error vs. Iteration')
plt.show()
return theta
Experiments: full-batch gradient descent (batchSize = m) under the three stopping criteria, then mini-batch descent on standardized features:
# Stop on iteration count:
# run(orig_data, theta, m, STOP_ITER, 5000, 0.000001)
# Stop on the change in loss:
# run(orig_data, theta, m, STOP_COST, thresh=0.000001, alpha=0.001)
# Stop on the gradient norm:
# run(orig_data, theta, m, STOP_GRAD, thresh=0.05, alpha=0.001)
# Results on standardized features (scale only the score columns; scaling the ones column would zero out the bias term):
orig_data[:, 1:n-1] = scale(orig_data[:, 1:n-1])
# run(orig_data, theta, m, STOP_ITER, 50000, 0.001)
theta = run(orig_data, theta, 16, STOP_ITER, 50000, 0.001)
Full code:
#coding=utf-8
import sys
import math
import random
import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import scale
# Stopping criteria
STOP_ITER = 0  # number of iterations
STOP_COST = 1  # change in the loss
STOP_GRAD = 2  # gradient norm
def show_data(data):
set_0 = data[data["label"]==0]
set_1 = data[data["label"]==1]
plt.figure(figsize=(10,5))
    plt.scatter(set_0["score1"], set_0["score2"], c="r", marker="*", label="not admitted (0)")
    plt.scatter(set_1["score1"], set_1["score2"], c="b", marker="o", label="admitted (1)")
    plt.legend()
plt.xlabel("score1")
plt.ylabel("score2")
plt.show()
def sigmoid(z):
    return 1 / (1 + np.exp(-z))
def model(X, theta):
    """
    X: sample matrix (including the leading column of ones)
    theta: row vector [θ0, θ1, ..., θn]
    Returns the predicted values.
    """
    return sigmoid(np.dot(X, theta.T))
def cost(X, y, theta):
    """
    Compute the loss: the mean of the per-sample losses.
    """
    # Note the difference between np.multiply (element-wise) and np.dot (matrix product)
    left = np.multiply(-y, np.log(model(X, theta)))
    right = np.multiply(1 - y, np.log(1 - model(X, theta)))
    return np.sum(left - right) / len(X)
def gradient(X, y, theta):
    """
    Compute the gradient; returns an array of the same shape as theta.
    """
    # 2-D array, 1*n
    grad = np.zeros(shape=theta.shape)
    # error is m*1 (one residual per sample)
    error = y - model(X, theta)
    for j in range(grad.shape[1]):
        # grad is 1*n
        grad[0, j] = -np.dot(error.T, X[:, j]) / X.shape[0]
    return grad
def gradient2(X, y, theta):
    # Alternative gradient: same result as gradient(), written with error = h - y
    grad = np.zeros(theta.shape)
    error = (model(X, theta) - y).ravel()
    for j in range(len(theta.ravel())):  # for each parameter
        term = np.multiply(error, X[:, j])
        grad[0, j] = np.sum(term) / len(X)
    return grad
def stop_criterion(type, value, threshold):
    if type == STOP_ITER: return value > threshold
    elif type == STOP_COST: return abs(value[-1] - value[-2]) < threshold
    elif type == STOP_GRAD: return np.linalg.norm(value) < threshold
# Shuffle the data
def shuffle_data(data):
    np.random.shuffle(data)
    X = data[:, :-1]
    y = data[:, -1:]
    return X, y
def descent(data, theta, batchsize, stoptype, threshold, alpha):
    i = 0  # iteration counter
    b = 0  # batch start index
    init_time = time.time()
    # Initialize the gradient
    grad = np.zeros(theta.shape)
    # Shuffle the data first
    X, y = shuffle_data(data)
    # Initialize the cost history
    cost_value = [cost(X, y, theta)]
    while True:
        # Compute the gradient on the current batch
        grad = gradient(X[b:b+batchsize, :], y[b:b+batchsize, :], theta)
        b += batchsize
        # After a full pass over the data, reshuffle it
        if b >= X.shape[0]:
            b = 0
            X, y = shuffle_data(data)
        # Update the parameters along the negative gradient
        theta = theta - alpha * grad
        # Loss value
        ## Note: the gradient may use only a batch of samples, but the loss is computed on all of them
        cost_value.append(cost(X, y, theta))
        i += 1
        if stoptype == STOP_ITER: value = i
        elif stoptype == STOP_COST: value = cost_value
        elif stoptype == STOP_GRAD: value = grad
        # Check the stopping criterion
        if stop_criterion(stoptype, value, threshold): break
    # Return: theta, iteration count, cost history, final gradient, elapsed time
    return theta, i - 1, cost_value, grad, time.time() - init_time
def run(data, theta, batchSize, stopType, thresh, alpha):
    theta, n_iter, costs, grad, dur = descent(data, theta, batchSize, stopType, thresh, alpha)
    # Build a descriptive title and plot the cost curve
    # If the feature column still holds raw scores (> 2), the data is unscaled
    name = "Original" if (data[:, 1] > 2).sum() > 1 else "Scaled"
name += " data - learning rate: {} - ".format(alpha)
if batchSize == data.shape[0]:
strDescType = "Gradient"
elif batchSize == 1:
strDescType = "Stochastic"
else:
strDescType = "Mini-batch ({})".format(batchSize)
name += strDescType + " descent - Stop: "
if stopType == STOP_ITER:
strStop = "{} iterations".format(thresh)
elif stopType == STOP_COST:
strStop = "costs change < {}".format(thresh)
else:
strStop = "gradient norm < {}".format(thresh)
name += strStop
print("***{}\nTheta: {} - Iter: {} - Last cost: {:03.2f} - Duration: {:03.2f}s".format(
name, theta, iter, costs[-1], dur))
fig, ax = plt.subplots(figsize=(12, 4))
ax.plot(np.arange(len(costs)), costs, 'r')
ax.set_xlabel('Iterations')
ax.set_ylabel('Cost')
ax.set_title(name.upper() + ' - Error vs. Iteration')
plt.show()
return theta
def predict(X, theta):
    # Threshold the predicted probability at 0.5
    return [1 if x > 0.5 else 0 for x in model(X, theta)]
def accuracy(y, predict_value):
    _y = y.ravel()
    correct = [1 if _y[i] == predict_value[i] else 0
               for i in range(len(_y))]
    return np.sum(correct) / len(_y)
if __name__ == '__main__':
    # Read the data
data = pd.read_csv("./LogiReg_data.txt", header=None, names=["score1", "score2", "label"])
print(data.head(3))
print(data.shape)
# show_data(data)
    # Add a column of ones to the data (for the bias term θ0)
data.insert(0,"Ones",value=1)
print(data.loc[:3,:])
# set X and y
    orig_data = data.to_numpy()  # as_matrix() was removed in pandas 1.0
    # m = number of samples, n = number of columns (ones column + 2 features + label)
m,n = orig_data.shape
theta = np.zeros(shape=(1, n - 1))
    # Full-batch gradient descent
    # Stop on iteration count:
    # run(orig_data, theta, m, STOP_ITER, 5000, 0.000001)
    # Stop on the change in loss:
    # run(orig_data, theta, m, STOP_COST, thresh=0.000001, alpha=0.001)
    # Stop on the gradient norm:
    # run(orig_data, theta, m, STOP_GRAD, thresh=0.05, alpha=0.001)
    # Standardize the score columns only (scaling the ones column would zero out the bias term)
    orig_data[:, 1:n-1] = scale(orig_data[:, 1:n-1])
# run(orig_data, theta, m, STOP_ITER, 50000, 0.001)
theta = run(orig_data, theta, 16, STOP_ITER, 50000, 0.001)
    # Compute the accuracy
    print("Accuracy:", accuracy(orig_data[:, -1], predict(orig_data[:, :(n-1)], theta)))