参照别人的写法做了改进:数据标准化和数据集的切分也完全手写,不使用sklearn的相关方法。
对之前的代码做了改进,数据集引用自sklearn。
通过参数设置,来指定使用线性回归,还是二分类逻辑回归。
但是,目前还不知道怎么实现多分类的模型算法。如可以,恳请高手指点。
#纯手写numpy实现回归和二分类模型
from sklearn import datasets
import numpy as np
def data_load():
    """Load the sklearn dataset that matches the module-level ``model_type``.

    ``model_type`` is read from module scope: ``"linear"`` selects the Boston
    housing data (regression) and ``"sigmoid"`` selects the breast-cancer data
    (binary classification).

    Returns:
        tuple: ``(x, y)`` taken from the dataset's ``data`` and ``target``
        attributes.

    Raises:
        ValueError: If ``model_type`` is neither ``"linear"`` nor
            ``"sigmoid"``.
    """
    if model_type == "linear":  # linear regression model
        # NOTE: load_boston was removed in scikit-learn 1.2, so this branch
        # only works with older scikit-learn releases.
        data = datasets.load_boston()
    elif model_type == "sigmoid":  # binary classification model
        data = datasets.load_breast_cancer()
    else:
        # Without this branch ``data`` stayed unbound and the function failed
        # below with an unrelated-looking UnboundLocalError.
        raise ValueError(
            "unsupported model_type {!r}; expected 'linear' or 'sigmoid'".format(
                model_type
            )
        )
    return data.data, data.target
def data_split(x, y, train_size=0.8, seed=10001):
    """Shuffle ``x`` and ``y`` together and split them into train/test parts.

    Args:
        x: Array of samples, indexed along its first axis.
        y: Array of targets with one entry per sample in ``x``.
        train_size: Fraction of the samples (between 0 and 1) placed in the
            training part; the remainder forms the test part.
        seed: Seed passed to ``np.random.seed`` before shuffling so the split
            is reproducible. ``None`` leaves the global RNG state untouched.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``x`` and ``y`` differ in length or ``train_size`` is
            outside ``[0, 1]``.
    """
    num = x.shape[0]  # number of samples
    if len(y) != num:
        raise ValueError(
            "x and y must have the same number of samples, got {} and {}".format(
                num, len(y)
            )
        )
    if not 0 <= train_size <= 1:
        raise ValueError(
            "train_size must be between 0 and 1, got {!r}".format(train_size)
        )

    # One shared index permutation keeps every x row paired with its y value.
    index = list(range(num))
    if seed is not None:
        # This seeds NumPy's *global* RNG, so later np.random calls (such as
        # the weight initialisation in model_train) become reproducible too.
        np.random.seed(seed)
    np.random.shuffle(index)
    x = x[index]
    y = y[index]

    train_num = int(num * train_size)
    return x[:train_num], x[train_num:], y[:train_num], y[train_num:]
def data_standscale(x):
    """Standardize ``x`` to zero mean and unit variance per feature.

    The mean and standard deviation are computed along axis 0, i.e. separately
    for every feature column of a 2-D input. The earlier version used a single
    mean/std over the whole matrix, which left features with different scales
    unstandardized.

    Args:
        x: Array-like of samples; a 1-D input is treated as a single feature.

    Returns:
        numpy.ndarray: Float array of the same shape as ``x``.
    """
    x = np.asarray(x, dtype=float)
    x_mean = np.mean(x, axis=0)
    x_std = np.std(x, axis=0)
    # A constant feature has std == 0; divide by 1 instead so the column
    # becomes all zeros rather than NaN.
    x_std = np.where(x_std == 0, 1.0, x_std)
    return (x - x_mean) / x_std
def data_cal(x, w, b):
    """Compute the model output for samples ``x`` with weights ``w`` and bias ``b``.

    The linear response ``np.dot(w, x.T) + b`` is returned as-is unless the
    module-level ``model_type`` is ``'sigmoid'``, in which case it is passed
    through the logistic function.

    Args:
        x: Sample matrix; its transpose is multiplied by ``w``.
        w: Weight vector.
        b: Bias term.

    Returns:
        The linear response, or its sigmoid when ``model_type == 'sigmoid'``.
    """
    y_ = np.dot(w, x.T) + b
    if model_type == 'sigmoid':
        # Clip the logit before exponentiating: np.exp overflows (and emits a
        # RuntimeWarning) once -y_ exceeds roughly 709. Within [-500, 500] the
        # result is identical to the unclipped formula.
        y_ = 1 / (1 + np.exp(-np.clip(y_, -500, 500)))
    return y_
def gradient_descent(x, y, w, b, learning_rate):
    """Run one gradient-descent step on a batch and report its cost.

    The loss depends on the module-level ``model_type``: mean squared error
    for ``'linear'`` and binary cross-entropy for ``'sigmoid'``. Predictions
    come from ``data_cal``.

    Args:
        x: Batch of samples, one row per sample.
        y: Targets for the batch.
        w: Current weight vector.
        b: Current bias.
        learning_rate: Step size applied to both gradients.

    Returns:
        tuple: ``(w, b, cost)`` with the updated parameters and the cost
        evaluated *before* the update.

    Raises:
        ValueError: If ``model_type`` is neither ``'linear'`` nor
            ``'sigmoid'``.
    """
    if model_type not in ('linear', 'sigmoid'):
        # Previously ``cost`` stayed unbound and the return statement failed.
        raise ValueError(
            "unsupported model_type {!r}; expected 'linear' or 'sigmoid'".format(
                model_type
            )
        )

    num = x.shape[0]
    y_ = data_cal(x, w, b)  # predictions with the current parameters
    residual = y_ - y

    if model_type == 'linear':
        cost = np.sum(residual ** 2) / num  # mean squared error
    else:
        # Keep probabilities strictly inside (0, 1) for the loss only, so a
        # saturated sigmoid cannot produce log(0) and an inf/nan cost.
        eps = 1e-15
        p = np.clip(y_, eps, 1 - eps)
        cost = -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p)) / num

    # Both losses share this gradient form; for the MSE case the constant
    # factor 2 of the derivative is left out (absorbed by the learning rate).
    g_w = np.dot(x.T, residual) / num
    g_b = np.sum(residual) / num
    w = w - g_w * learning_rate
    b = b - g_b * learning_rate
    return w, b, cost
def model_train(x, y, epochs, batch_size, learning_rate):
    """Fit weights and bias with mini-batch gradient descent.

    Batches are taken in order (no per-epoch shuffling) and each one is passed
    to ``gradient_descent``. Progress is printed roughly ten times over the
    run, showing the cost of the last batch of the reported epoch.

    Args:
        x: Training samples; a 1-D array is treated as a single feature.
        y: Training targets, one per sample.
        epochs: Number of passes over the training data.
        batch_size: Number of samples per gradient step; must be positive.
        learning_rate: Step size forwarded to ``gradient_descent``.

    Returns:
        tuple: ``(w, b)`` — the trained weight vector and bias.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(
            "batch_size must be positive, got {!r}".format(batch_size)
        )

    # Handle 1-D input *before* reading shape[1]; the old order raised
    # IndexError and made the ndim check unreachable.
    if x.ndim == 1:
        x = x.reshape(-1, 1)
    num = x.shape[0]  # number of samples
    num_features = x.shape[1]  # number of features

    w = np.random.random(size=(num_features,))  # random initial weights
    b = 0

    # epochs // 10 is 0 for fewer than ten epochs, which used to raise
    # ZeroDivisionError in the modulo below.
    log_every = max(1, epochs // 10)
    cost = float('nan')  # stays NaN only if there are no samples at all

    for epoch in range(epochs):
        # Stepping by batch_size covers the final partial batch without ever
        # producing an empty one.
        for start in range(0, num, batch_size):
            stop = start + batch_size
            w, b, cost = gradient_descent(
                x[start:stop], y[start:stop], w, b, learning_rate
            )
        if epoch % log_every == 0:
            print('epoch', epoch, 'cost', cost)
    return w, b
def main():
    """Run the whole pipeline: load, standardize, split, train, evaluate.

    Reads the module-level settings ``epochs``, ``batch_size``,
    ``learning_rate`` and ``model_type`` and prints a test-set score:
    the mean of ``1 - |pred - y| / y`` for ``'linear'``, or the share of
    correctly thresholded predictions (cut-off 0.5) for ``'sigmoid'``.
    """
    features, targets = data_load()  # load the dataset
    # NOTE(review): scaling happens before the split, so the scaling
    # statistics also include the test samples.
    features = data_standscale(features)
    x_train, x_test, y_train, y_test = data_split(features, targets)
    w, b = model_train(x_train, y_train, epochs, batch_size, learning_rate)
    prediction = data_cal(x_test, w, b)  # predictions from the trained parameters

    if model_type == 'linear':
        # Per-sample score: one minus the relative absolute error.
        relative_error = np.abs(prediction - y_test) / y_test
        print('test accuracy:', np.mean(1 - relative_error))
    elif model_type == "sigmoid":
        # A prediction counts as correct when >= 0.5 pairs with label 1 and
        # < 0.5 pairs with label 0.
        expected_label = np.where(prediction >= 0.5, 1, 0)
        correct = int(np.count_nonzero(y_test == expected_label))
        print('test accuracy', correct / len(y_test))
if __name__ == '__main__':
    # Run configuration. These are module-level globals: main() reads all four
    # and data_load/data_cal/gradient_descent read model_type.
    model_type = 'sigmoid'  # 'linear' or 'sigmoid'
    learning_rate = 0.001
    batch_size = 32
    epochs = 100000
    main()