机器学习——逻辑回归代码实现

Nikko6688

已于 2024-08-06 00:17:11 修改

阅读量477

点赞数 8

文章标签：机器学习逻辑回归人工智能回归 python 算法 pycharm

于 2024-08-05 23:53:25 首次发布

本文链接：https://blog.csdn.net/weixin_57205312/article/details/140939589

版权

机器学习——逻辑回归代码实现

此文章介绍机器学习逻辑回归的代码实现，其中包括：线性决策边界（logistic_regression_with_linear_boundary）以及非线性决策边界（NonLinearBoundary）。

首先最重要的utils工具包和线性回归中的一样，直接导入项目即可。

LogisticRegression类

然后创建LogisticRegression类，其中包括：初始化函数（__init__）、训练函数（train）、梯度下降函数（gradient_descent）、用于计算损失的函数（cost_function）、根据给定数据和权重参数（theta）计算预测值的函数（hypothesis）、计算梯度的函数（gradient_step），以及预测函数（predict）。

logistic_regression.py（文件名需与后文脚本中的 `from logistic_regression import LogisticRegression` 保持一致）

# 逻辑回归是经典的二分类算法，也可以做多分类
import numpy as np
from scipy.optimize import minimize
from utils.features import prepare_for_training
from utils.hypothesis import sigmoid

class LogisticRegression:
    """Logistic regression classifier trained one-vs-rest.

    One parameter row is kept per distinct label; each row is fitted as an
    independent binary problem ("this label" vs. "everything else") with
    SciPy's conjugate-gradient ``minimize``. Prediction picks the label whose
    classifier reports the highest probability.
    """

    def __init__(self, data, labels, polynomial_degree = 0, sinusoid_degree = 0, normalize_data = False):
        """Preprocess the training data and allocate the parameter matrix.

        Args:
            data: training samples, one row per example.
            labels: class labels for the samples. ``gradient_step`` subtracts
                them from a ``(num_examples, 1)`` prediction column, so they
                are presumably expected as a ``(num_examples, 1)`` column too.
            polynomial_degree: forwarded to ``prepare_for_training``.
            sinusoid_degree: forwarded to ``prepare_for_training``.
            normalize_data: forwarded to ``prepare_for_training`` both here
                and in ``predict``.
        """
        # Preprocess the raw features. The caller's ``normalize_data`` choice
        # is forwarded (it used to be hard-coded to False here) so training
        # and ``predict`` go through the same preprocessing.
        (data_processed,
         features_mean,
         features_deviation) = prepare_for_training(
            data, polynomial_degree, sinusoid_degree, normalize_data=normalize_data)

        # Keep the processed matrix rather than the raw input so the feature
        # count matches the width of theta.
        self.data = data_processed
        self.labels = labels
        self.unique_labels = np.unique(labels)
        self.features_mean = features_mean
        self.features_deviation = features_deviation
        self.polynomial_degree = polynomial_degree
        self.sinusoid_degree = sinusoid_degree
        self.normalize_data = normalize_data

        # shape[0] is the number of examples, shape[1] the number of features.
        num_features = self.data.shape[1]
        num_unique_labels = self.unique_labels.shape[0]
        # One row of parameters per class, one column per feature.
        self.theta = np.zeros((num_unique_labels, num_features))

    def train(self,max_iterations = 1000):
        """Fit one binary classifier per unique label.

        Args:
            max_iterations: iteration cap handed to the optimizer.

        Returns:
            Tuple ``(theta, cost_histories)``: the fitted parameter matrix and
            a list holding one cost history per label, in the order of
            ``self.unique_labels``.

        Raises:
            ArithmeticError: propagated from ``gradient_descent`` when the
                optimizer does not report success.
        """
        cost_histories = []
        num_features = self.data.shape[1]
        for label_index, unique_label in enumerate(self.unique_labels):
            current_initial_theta = np.copy(self.theta[label_index].reshape(num_features,1))
            # Binary targets: 1.0 for the current label, 0.0 for all others.
            current_labels = (self.labels == unique_label).astype(float)
            (current_theta, cost_history) = LogisticRegression.gradient_descent(
                self.data,current_labels,current_initial_theta,max_iterations)
            self.theta[label_index] = current_theta.T
            cost_histories.append(cost_history)
        return self.theta,cost_histories

    @staticmethod
    def gradient_descent(data,labels,current_initial_theta,max_iterations):
        """Minimize the logistic cost for a single binary problem.

        Args:
            data: processed feature matrix.
            labels: binary (0/1) targets for the current class.
            current_initial_theta: starting parameters; flattened before
                being handed to the optimizer.
            max_iterations: value for the optimizer's ``maxiter`` option.

        Returns:
            Tuple ``(optimized_theta, cost_history)`` where the parameters are
            reshaped to ``(num_features, 1)`` and the history holds the cost
            recorded by the optimizer callback.

        Raises:
            ArithmeticError: if ``result.success`` is false.
        """
        cost_history = []
        num_features = data.shape[1]

        def as_column(flat_theta):
            # SciPy passes a flat vector; the cost/gradient code works on columns.
            return flat_theta.reshape(num_features,1)

        def objective(flat_theta):
            return LogisticRegression.cost_function(data,labels,as_column(flat_theta))

        def gradient(flat_theta):
            return LogisticRegression.gradient_step(data,labels,as_column(flat_theta))

        def record_cost(flat_theta):
            cost_history.append(objective(flat_theta))

        result = minimize(
            objective,
            # Initial parameters.
            current_initial_theta.flatten(),
            # Conjugate-gradient optimizer.
            method='CG',
            # Analytic gradient of the objective.
            jac = gradient,
            # Record the cost through the optimizer callback.
            callback = record_cost,
            options = {'maxiter':max_iterations}
        )
        if not result.success:
            raise ArithmeticError('Can not minimize cost function'+result.message)
        optimized_theta = result.x.reshape(num_features,1)
        return optimized_theta,cost_history

    @staticmethod
    def cost_function(data,labels,theta):
        """Return the mean cross-entropy cost of ``theta`` on ``data``/``labels``.

        Args:
            data: processed feature matrix.
            labels: binary (0/1) targets with the same shape as the
                predictions, since they are used as a boolean mask on them.
            theta: parameter column.
        """
        num_examples = data.shape[0]
        predictions = LogisticRegression.hypothesis(data,theta)
        # Keep predictions strictly inside (0, 1) so a saturated sigmoid
        # cannot produce log(0) = -inf and poison the optimizer.
        eps = np.finfo(float).eps
        predictions = np.clip(predictions, eps, 1 - eps)
        # Contribution of the examples that belong to the current class ...
        y_is_set_cost = np.dot(labels[labels == 1].T,np.log(predictions[labels == 1]))
        # ... and of the examples that do not.
        y_is_not_set_cost = np.dot(1-labels[labels == 0].T,np.log(1-predictions[labels == 0]))
        cost = (-1/num_examples)*(y_is_set_cost+y_is_not_set_cost)
        return cost

    @staticmethod
    def hypothesis(data,theta):
        """Return ``sigmoid(data @ theta)``."""
        predictions = sigmoid(np.dot(data,theta))
        return predictions

    @staticmethod
    def gradient_step(data,labels,theta):
        """Return the gradient of the cost with respect to ``theta``, flattened.

        Args:
            data: processed feature matrix.
            labels: binary (0/1) targets, subtracted element-wise from the
                predictions.
            theta: parameter column.
        """
        num_examples = labels.shape[0]
        predictions = LogisticRegression.hypothesis(data,theta)
        label_diff = predictions - labels
        gradients = (1/num_examples) * np.dot(data.T,label_diff)
        # The optimizer expects a flat vector.
        return gradients.T.flatten()

    def predict(self,data):
        """Predict a label for every row of ``data``.

        Args:
            data: raw (unprocessed) samples; they go through
                ``prepare_for_training`` with the settings stored at
                construction time.

        Returns:
            Object array of shape ``(num_examples, 1)`` holding, for each
            example, the label whose classifier gave the highest probability.
        """
        num_examples = data.shape[0]
        # NOTE(review): this re-runs the preprocessing on the prediction batch
        # itself; the stored features_mean / features_deviation are not used
        # here -- confirm that is intended when normalize_data is True.
        data_processed = prepare_for_training(data, self.polynomial_degree, self.sinusoid_degree, self.normalize_data)[0]
        prob = LogisticRegression.hypothesis(data_processed,self.theta.T)
        # Column index of the most probable class for each example.
        max_prob_index = np.argmax(prob,axis=1)
        class_prediction = np.empty(max_prob_index.shape,dtype=object)
        for index,label in enumerate(self.unique_labels):
            class_prediction[max_prob_index == index] = label
        return class_prediction.reshape((num_examples,1))

线性决策边界

线性决策边界基于iris.csv数据集进行训练与测试。

首先进行线性决策边界的绘制：即logistic_regression_with_linear_boundary

logistic_regression_with_linear_boundary.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from logistic_regression import LogisticRegression

# Train a one-vs-rest logistic regression on two iris features and draw the
# resulting decision boundaries.
data = pd.read_csv('../data/iris.csv')
iris_types = ['SETOSA','VERSICOLOR','VIRGINICA']

x_axis = 'petal_length'
y_axis = 'petal_width'

# Scatter plot of the raw samples, one series per class.
for iris_type in iris_types:
    plt.scatter(data[x_axis][data['class'] == iris_type],
                data[y_axis][data['class'] == iris_type],
                label = iris_type
                )

plt.legend()
plt.show()

# Number of samples.
num_examples = data.shape[0]
# Build the training arrays; .values converts the pandas objects to ndarrays.
x_train = data[[x_axis,y_axis]].values.reshape((num_examples,2))
y_train = data['class'].values.reshape((num_examples,1))


max_iterations = 1000
polynomial_degree = 0
sinusoid_degree = 0

# Instantiate and train the model.
logistic_regression = LogisticRegression(x_train,y_train,polynomial_degree,sinusoid_degree)
thetas,cost_histories = logistic_regression.train(max_iterations)
labels = logistic_regression.unique_labels

# One cost curve per class, in the order of the model's unique labels.
for label,cost_history in zip(labels,cost_histories):
    plt.plot(range(len(cost_history)),cost_history,label=label)
plt.legend()
plt.show()

# Share of training samples that are classified correctly, in percent.
y_train_predictions = logistic_regression.predict(x_train)
precision = np.sum(y_train_predictions == y_train)/y_train.shape[0] * 100
print('precision:',precision)

x_min = np.min(x_train[:,0])
x_max = np.max(x_train[:,0])
y_min = np.min(x_train[:,1])
y_max = np.max(x_train[:,1])
samples = 150
X = np.linspace(x_min,x_max,samples)
Y = np.linspace(y_min,y_max,samples)

# One 0/1 indicator grid per class; contour() then traces each region's edge.
Z_SETOSA = np.zeros((samples,samples))
Z_VERSICOLOR = np.zeros((samples,samples))
Z_VIRGINICA = np.zeros((samples,samples))

for x_index,x in enumerate(X):
    for y_index,y in enumerate(Y):
        # A separate name keeps the loaded DataFrame `data` intact.
        point = np.array([[x,y]])
        prediction = logistic_regression.predict(point)[0][0]
        # contour(X, Y, Z) reads Z as Z[row = y, column = x], so the grids are
        # indexed [y_index][x_index].
        if prediction == 'SETOSA':
            Z_SETOSA[y_index][x_index] = 1
        elif prediction == 'VERSICOLOR':
            Z_VERSICOLOR[y_index][x_index] = 1
        elif prediction == 'VIRGINICA':
            Z_VIRGINICA[y_index][x_index] = 1

for iris_type in iris_types:
    plt.scatter(
        x_train[(y_train == iris_type).flatten(),0],
        x_train[(y_train == iris_type).flatten(),1],
        label=iris_type
        )

plt.contour(X,Y,Z_SETOSA)
plt.contour(X,Y,Z_VERSICOLOR)
plt.contour(X,Y,Z_VIRGINICA)
plt.legend()
plt.show()

结果展示：

实质就是运用三次二分类的方法进行边界的绘制：首先令蓝色==1、另外两个颜色==0，将蓝色与其他颜色的边界绘出；然后对橙色进行二分类，即橙色==1、其他颜色==0，将橙色与另外两个颜色的边界绘出；最后对绿色运用同样的二分类思想，即绿色==1、其他颜色==0，将绿色与其他颜色的边界绘出。（图中看起来是两条线，其实是四条线：基于蓝色绘制一条，橙色绘制两条，绿色绘制一条。若想具体观察，可将最后的contour图逐一单独展示。）

非线性决策边界

非线性决策边界的绘制采用microchips-tests.csv数据集。

NonLinearBoundary.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math

from logistic_regression import LogisticRegression

# Fit a logistic regression with polynomial features on the microchip test
# data and draw the resulting non-linear decision boundary.
data = pd.read_csv('../data/microchips-tests.csv')

# Class labels.
validities = [0,1]

# The two features used for training and plotting.
x_axis = 'param_1'
y_axis = 'param_2'

# Scatter plot of the raw samples, one series per class.
for validity in validities:
    plt.scatter(
        data[x_axis][data['validity'] == validity],
        data[y_axis][data['validity'] == validity],
        label = validity
    )

plt.xlabel(x_axis)
plt.ylabel(y_axis)
plt.title('Microchips Tests')
plt.legend()
plt.show()

num_examples = data.shape[0]
x_train = data[[x_axis,y_axis]].values.reshape((num_examples,2))
y_train = data['validity'].values.reshape((num_examples,1))

# Training parameters.
max_iterations = 100000
# Not passed to LogisticRegression below; kept only as a placeholder setting.
regularization_param = 0
polynomial_degree = 5
sinusoid_degree = 0

# Build the model.
logistic_regression = LogisticRegression(x_train,y_train,polynomial_degree,sinusoid_degree)

# Train it.
(thetas,costs) = logistic_regression.train(max_iterations)

# Cost curves, one per class, in the order of the model's unique labels.
labels = logistic_regression.unique_labels

plt.plot(range(len(costs[0])),costs[0],label = labels[0])
plt.plot(range(len(costs[1])),costs[1],label = labels[1])

plt.xlabel('Gradient Steps')
plt.ylabel('Cost')
plt.legend()
plt.show()

# Predict on the training set.
y_train_predictions = logistic_regression.predict(x_train)

# Share of training samples that are classified correctly, in percent.
precision = np.sum(y_train_predictions == y_train) / y_train.shape[0] * 100
print('Training Precision: {:5.4f}%'.format(precision))

samples = 150
x_min = np.min(x_train[:,0])
x_max = np.max(x_train[:,0])

y_min = np.min(x_train[:,1])
y_max = np.max(x_train[:,1])
X = np.linspace(x_min,x_max,samples)
Y = np.linspace(y_min,y_max,samples)
Z = np.zeros((samples, samples))

# Evaluate the classifier on a regular grid covering the training range.
for x_index,x in enumerate(X):
    for y_index,y in enumerate(Y):
        # A separate name keeps the loaded DataFrame `data` intact.
        point = np.array([[x,y]])
        # contour(X, Y, Z) reads Z as Z[row = y, column = x].
        Z[y_index][x_index] = logistic_regression.predict(point)[0][0]

positives = (y_train == 1).flatten()
negatives = (y_train == 0).flatten()

plt.scatter(x_train[negatives,0], x_train[negatives,1],label = '0')
plt.scatter(x_train[positives,0], x_train[positives,1],label = '1')

plt.contour(X,Y,Z)
plt.legend()
plt.show()

结果展示：

图一即为原始数据点的分布情况。图二表示随着梯度下降的进行，损失值cost也在逐渐减少。

图三即为基于原始数据将此非线性决策边界绘出的结果。

Nikko6688

关注

8
点赞
踩
5

收藏

觉得还不错? 一键收藏
0
评论
机器学习——逻辑回归代码实现

全文介绍机器学习——逻辑回归的代码实现，包括：线性决策边界（logistic_regression_with_linear_boundary）以及非线性决策边界（NonLinearBoundary）
复制链接

扫一扫