纯numpy写线性回归、及二分类模型

最新推荐文章于 2024-04-04 19:20:48 发布

lmw0320

最新推荐文章于 2024-04-04 19:20:48 发布

阅读量1.1k

点赞数 1

分类专栏：机器学习基础文章标签：机器学习

本文链接：https://blog.csdn.net/lmw0320/article/details/106923552

版权

机器学习基础专栏收录该内容

2 篇文章 0 订阅

订阅专栏

参照别人的写法进行了改进：数据标准化和数据集切分均为手写实现，不使用sklearn的相关方法。
相比之前的代码，本版只引用了sklearn自带的数据集（仅用于加载数据）。
通过参数model_type的设置，来指定使用线性回归（"linear"）还是二分类逻辑回归（"sigmoid"）。
但是，目前还不知道多分类的模型算法该怎么写，如有可能，恳请高手指点。

#纯手写numpy实现回归和二分类模型
from sklearn import  datasets
import numpy as np

def data_load():
    """Load the scikit-learn dataset selected by the global ``model_type``.

    ``"linear"`` loads the Boston housing data (regression) and ``"sigmoid"``
    loads the breast-cancer data (binary classification).

    Returns:
        tuple: ``(x, y)`` taken from the ``data`` and ``target`` attributes
        of the loaded dataset object.

    Raises:
        ValueError: if ``model_type`` is neither ``"linear"`` nor ``"sigmoid"``.
    """
    if model_type == "linear":  # linear regression model
        # NOTE(review): load_boston is reported as removed in recent
        # scikit-learn releases -- confirm against the installed version.
        data = datasets.load_boston()
    elif model_type == "sigmoid":  # binary classification model
        data = datasets.load_breast_cancer()
    else:
        # Fail with a clear message instead of an UnboundLocalError on `data`.
        raise ValueError(
            "unsupported model_type %r; expected 'linear' or 'sigmoid'" % (model_type,)
        )
    return data.data, data.target

def data_split(x, y, train_size = 0.8):
    """Shuffle ``x`` and ``y`` together and cut them into train/test parts.

    The same permutation is applied to both arrays so that every sample
    keeps its label. NumPy's global random generator is re-seeded with a
    fixed value on every call, which makes the split reproducible.

    Args:
        x: array of samples, indexed along its first axis.
        y: array of targets with one entry per sample.
        train_size: fraction of the samples placed in the training part.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.
    """
    total = x.shape[0]
    # A shared list of positions keeps x and y aligned while shuffling.
    order = list(range(total))
    np.random.seed(10001)  # fixed seed so results can be reproduced
    np.random.shuffle(order)

    x_shuffled, y_shuffled = x[order], y[order]
    cut = int(total * train_size)
    return x_shuffled[:cut], x_shuffled[cut:], y_shuffled[:cut], y_shuffled[cut:]

def data_standscale(x):
    """Standardise the data to zero mean and unit variance, feature by feature.

    Statistics are computed along axis 0, so for a 2-D ``(samples, features)``
    array every column is scaled with its own mean and standard deviation.
    A 1-D array is treated as a single feature.

    Args:
        x: array-like of numeric values, samples along the first axis.

    Returns:
        numpy.ndarray: array of the same shape as ``x`` holding the
        standardised values.
    """
    x = np.asarray(x)
    x_mean = np.mean(x, axis=0)
    x_std = np.std(x, axis=0)
    # A constant feature has std 0; divide by 1 instead so the centred column
    # becomes all zeros rather than nan/inf.
    x_std = np.where(x_std == 0, 1.0, x_std)
    return (x - x_mean) / x_std

def data_cal(x, w, b):
    """Compute the model output for every row of ``x``.

    The linear score ``w . x + b`` is returned as is, unless the global
    ``model_type`` is ``'sigmoid'``, in which case the score is passed
    through the logistic function.

    Args:
        x: sample matrix; its transpose is multiplied by ``w``.
        w: weight vector.
        b: bias added to every score.

    Returns:
        The raw scores, or their sigmoid when ``model_type == 'sigmoid'``.
    """
    scores = np.dot(w, x.T) + b
    if model_type != 'sigmoid':
        return scores
    # Logistic function squashes the scores into (0, 1).
    return 1 / (1 + np.exp(-scores))

def gradient_descent(x, y, w, b, learning_rate):
    """Perform one gradient-descent update on a batch and report its loss.

    The loss depends on the global ``model_type``: mean squared error for
    ``'linear'`` and binary cross-entropy for ``'sigmoid'``. The loss is
    evaluated with the parameters passed in, i.e. before the update.

    Args:
        x: batch of samples, one row per sample.
        y: targets for the batch.
        w: current weight vector.
        b: current bias.
        learning_rate: step size applied to both gradients.

    Returns:
        tuple: ``(w, b, cost)`` -- updated weights, updated bias and the
        batch loss.

    Raises:
        ValueError: if ``model_type`` is neither ``'linear'`` nor ``'sigmoid'``.
    """
    if model_type not in ('linear', 'sigmoid'):
        # Without this check `cost` would stay unbound and the return fails.
        raise ValueError(
            "unsupported model_type %r; expected 'linear' or 'sigmoid'" % (model_type,)
        )

    num = x.shape[0]
    y_ = data_cal(x, w, b)  # predictions with the current parameters
    if model_type == 'linear':
        cost = np.sum((y_ - y) ** 2) / num  # mean squared error
    else:
        # Keep probabilities away from exactly 0 and 1: log(0) is -inf and
        # 0 * -inf is nan, which would poison the reported loss once the
        # sigmoid saturates. Only the loss is clipped, not the gradient.
        eps = 1e-15
        p = np.clip(y_, eps, 1 - eps)
        cost = - np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / num  # binary cross-entropy

    # Same gradient expression for both models. For the squared-error loss
    # the factor 2 is left out, which only rescales the effective step size.
    residual = y_ - y
    g_w = np.dot(x.T, residual) / num
    g_b = np.sum(residual) / num

    w = w - g_w * learning_rate
    b = b - g_b * learning_rate

    return w, b, cost

def model_train(x, y, epochs, batch_size, learning_rate):
    """Fit weights and bias with mini-batch gradient descent.

    Weights are initialised from ``np.random.random`` and the bias from 0.
    Every epoch walks over the data in consecutive slices of ``batch_size``
    rows (the last slice may be shorter) and calls ``gradient_descent`` on
    each. The loss of the last batch is printed roughly ten times over the
    whole run.

    Args:
        x: training samples; a 1-D array is treated as one feature per sample.
        y: training targets, one per sample.
        epochs: number of passes over the data.
        batch_size: number of rows per update step.
        learning_rate: step size forwarded to ``gradient_descent``.

    Returns:
        tuple: ``(w, b)`` -- the trained weight vector and bias.
    """
    # Check dimensionality BEFORE reading shape[1]; a 1-D input becomes a
    # single-feature column so the dot products downstream line up.
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    num = x.shape[0]  # number of samples
    num_features = x.shape[1]  # number of features

    w = np.random.random(size=(num_features,))  # random initial weights
    b = 0
    cost = None  # stays None when there is no data to form a batch
    # Guard against epochs < 10, where epochs // 10 would be 0 and the
    # modulo below would raise ZeroDivisionError.
    log_every = max(1, epochs // 10)

    for epoch in range(epochs):
        # Stepping by batch_size yields ceil(num / batch_size) slices.
        for start in range(0, num, batch_size):
            batch_x = x[start : start + batch_size]
            batch_y = y[start : start + batch_size]
            w, b, cost = gradient_descent(batch_x, batch_y, w, b, learning_rate)
        if epoch % log_every == 0:
            print('epoch', epoch, 'cost', cost)
    return w, b

def main():
    """Load, standardise and split the data, train the model and print a test score.

    Reads the module-level settings ``epochs``, ``batch_size``,
    ``learning_rate`` and ``model_type``. For ``'linear'`` the printed score
    is the mean of ``1 - |pred - y| / y`` over the test samples; for
    ``'sigmoid'`` it is the share of test samples whose prediction,
    thresholded at 0.5, matches the 0/1 label.
    """
    features, targets = data_load()  # load the dataset
    features = data_standscale(features)  # standardise the data
    x_train, x_test, y_train, y_test = data_split(features, targets)  # train/test split
    w, b = model_train(x_train, y_train, epochs, batch_size, learning_rate)

    prediction = data_cal(x_test, w, b)  # predictions from the trained parameters
    if model_type == 'linear':
        # Per-sample score: one minus the relative absolute error.
        accuracy = [
            1 - np.abs(pred - truth) / truth
            for pred, truth in zip(prediction, y_test)
        ]
        print('test accuracy:', np.mean(accuracy))
    elif model_type == "sigmoid":
        # Count samples where the thresholded prediction agrees with the label.
        hits = sum(
            1
            for pred, truth in zip(prediction, y_test)
            if (pred >= 0.5 and truth == 1) or (not pred >= 0.5 and truth == 0)
        )
        print('test accuracy', hits / len(y_test))
if __name__ == '__main__':
    # Run configuration. These are module-level globals that the functions
    # above read directly, so the names must stay as they are.
    model_type = 'sigmoid'  # 'linear' or 'sigmoid'
    learning_rate = 0.001
    batch_size = 32
    epochs = 10 ** 5
    main()