NumPy实现logistic回归

Singcing

已于 2024-09-01 15:15:10 修改

阅读量302

点赞数 1

分类专栏：机器学习 文章标签：numpy

于 2024-09-01 15:01:27 首次发布

本文链接：https://blog.csdn.net/m0_46306264/article/details/141781938

版权

机器学习专栏收录该内容

17 篇文章 0 订阅

订阅专栏

1. sklearn实现

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  r2_score,confusion_matrix

# Baseline: fit scikit-learn's LogisticRegression on BreastCancer.csv and
# report accuracy, R2 and the confusion matrix on a held-out test split.

# Build the dataset path with os.path.join so the separator matches the host
# OS (a hard-coded "\\" only works on Windows).
current_dir = os.getcwd()
path = os.path.join(current_dir, "BreastCancer.csv")

cancer = pd.read_csv(path)

# Remove the "id" and "Unnamed: 32" columns so they are not used as inputs.
# axis=1 drops columns; axis=0 would drop rows.
cancer = cancer.drop(["id", "Unnamed: 32"], axis=1)
# Encode the label column: "M" becomes 1, every other value becomes 0.
cancer['diagnosis'] = [1 if label == "M" else 0 for label in cancer['diagnosis']]

# Shapes in the trailing comments are the ones reported by the original author.
y = cancer['diagnosis']                 # (569,)
x = cancer.drop(["diagnosis"], axis=1)  # (569, 30)

# 80/20 split with a fixed seed: (455, 30) (114, 30) (455,) (114,)
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Standardise every column: X_scaled = (x - mean) / std, so each feature has
# zero mean and unit variance.  The statistics are fitted on the training
# split only and then reused for the test split.
# NOTE(review): the original author states that skipping this step leads to
# an overflow.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

logreg = LogisticRegression(max_iter=1000)
logreg.fit(x_train_scaled, y_train)

train_accuracy = logreg.score(x_train_scaled, y_train)  # 0.989010989010989
test_accuracy = logreg.score(x_test_scaled, y_test)     # 0.9649122807017544
print("Training Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

y_pred = logreg.predict(x_test_scaled)
# R2 is computed on the hard 0/1 predictions, exactly as in the original post.
r2 = r2_score(y_test, y_pred)  # Test R2: 0.8551921244839632
print("Test R2:", r2)

print(confusion_matrix(y_test, y_pred))
# Confusion matrix reported by the original author:
# [[65  2]
#  [ 2 45]]

2. NumPy实现

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.metrics import confusion_matrix

class Logistic:
    """Binary logistic regression trained with full-batch gradient descent.

    Parameters
    ----------
    learning_rate : float
        Step size applied to every gradient update.
    epochs : int
        Number of gradient-descent iterations run by ``fit``.
    verbose : bool
        When true, ``fit`` prints the cross-entropy loss every 100 epochs.
    """

    def __init__(self, learning_rate=0.01, epochs=1000, verbose=False):
        self.lr = learning_rate
        self.epochs = epochs
        self.verbose = verbose
        # Created by ``initialise_weights`` once the feature count is known.
        self.weight = None
        self.bias = None

    def sigmoid(self, x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``.

        The naive formula overflows in ``exp`` for large negative ``x``.
        Here ``exp`` is only ever applied to non-positive values, so the
        result saturates cleanly to 0 or 1 instead of emitting warnings.
        """
        x = np.asarray(x, dtype=float)
        decay = np.exp(-np.abs(x))  # always in (0, 1], never overflows
        # x >= 0: 1 / (1 + e^-x);  x < 0: e^x / (1 + e^x) -- same function.
        return np.where(x >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def initialise_weights(self, n):
        """Reset the model to ``n`` zero weights and a zero bias."""
        self.weight = np.zeros(n)
        self.bias = 0.0

    def predict_prob(self, X):
        """Return the predicted probability of class 1 for each row of ``X``."""
        linear_result = X @ self.weight + self.bias
        return self.sigmoid(linear_result)

    def fit(self, X, y):
        """Fit weights and bias by gradient descent on the cross-entropy loss.

        Parameters
        ----------
        X : array-like of shape (m, n)
            Training features.
        y : array-like of length m
            Binary targets (0 or 1).  A column vector of shape (m, 1) is
            flattened so it cannot silently broadcast against predictions.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        m, n = X.shape
        if y.shape[0] != m:
            raise ValueError(
                f"X has {m} samples but y has {y.shape[0]} targets"
            )
        self.initialise_weights(n)

        # Smallest margin that keeps log() finite when a probability saturates.
        eps = np.finfo(float).eps

        for epoch in range(self.epochs):
            y_pred = self.predict_prob(X)

            # Gradient of the mean cross-entropy w.r.t. the linear output.
            error = y_pred - y
            dcost_dw = (X.T @ error) / m
            dcost_db = np.sum(error) / m

            self.weight -= self.lr * dcost_dw
            self.bias -= self.lr * dcost_db

            if self.verbose and epoch % 100 == 99:
                # Loss uses the predictions made before this epoch's update.
                # Clipping avoids log(0) -> inf/nan for saturated outputs.
                clipped = np.clip(y_pred, eps, 1.0 - eps)
                loss = -np.mean(
                    y * np.log(clipped) + (1 - y) * np.log(1 - clipped)
                )
                print(f"epoch:{epoch} loss:{loss}")

    def predict(self, X):
        """Return hard 0/1 labels, thresholding the probability at 0.5."""
        probabilities = self.predict_prob(X)
        return np.where(probabilities >= 0.5, 1, 0)

# Train the hand-written Logistic model on BreastCancer.csv and report R2 and
# the confusion matrix on a held-out test split.

# Build the dataset path with os.path.join so the separator matches the host
# OS (a hard-coded "\\" only works on Windows).
current_dir = os.getcwd()
path = os.path.join(current_dir, "BreastCancer.csv")

cancer = pd.read_csv(path)

# Remove the "id" and "Unnamed: 32" columns so they are not used as inputs.
# axis=1 drops columns; axis=0 would drop rows.
cancer = cancer.drop(["id", "Unnamed: 32"], axis=1)
# Encode the label column: "M" becomes 1, every other value becomes 0.
cancer['diagnosis'] = [1 if label == "M" else 0 for label in cancer['diagnosis']]

print(cancer.shape)  # (569, 31) as reported by the original author

y = cancer['diagnosis']                 # (569,)
x = cancer.drop(["diagnosis"], axis=1)  # (569, 30)

# 80/20 split with a fixed seed: (455, 30) (114, 30) (455,) (114,).
# train_test_split returns pandas objects here, so convert them to ndarrays
# before handing them to the NumPy implementation.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)
x_train, x_test = np.array(x_train), np.array(x_test)
y_train, y_test = np.array(y_train), np.array(y_test)

# Standardise with statistics fitted on the training split only.
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

logistic = Logistic(learning_rate=0.01, epochs=1000, verbose=True)
logistic.fit(x_train_scaled, y_train)

y_pred = logistic.predict(x_test_scaled)

# R2 is computed on the hard 0/1 predictions, exactly as in the original post.
r2 = r2_score(y_test, y_pred)
print(r2)

print(confusion_matrix(y_test, y_pred))
# Confusion matrix reported by the original author:
# [[65  2]
#  [ 2 45]]