逻辑回归-二分类
数据集是自己生成的,训练集8000个,每个样本10个特征和1个标签,测试集2000个,每个样本10个特征和1个标签
读取数据
# coding: utf-8
import numpy as np
def get_data(path="dataset.npz"):
    """Load the train/test arrays from a NumPy ``.npz`` archive.

    Args:
        path: Location of the archive. Defaults to ``"dataset.npz"`` in the
            current working directory, which is the file the original code
            always read.

    Returns:
        A 4-tuple ``(x_train, y_train, x_test, y_test)`` with the arrays
        stored under those keys. The original notes describe 8000 training
        and 2000 test samples with 10 features each; nothing here enforces
        those shapes.

    Raises:
        OSError: If ``path`` cannot be opened (includes FileNotFoundError).
        KeyError: If one of the four keys is missing from the archive.
    """
    # np.load on an .npz returns a lazy NpzFile that keeps the file open;
    # the context manager closes it once the arrays are read into memory.
    with np.load(path) as d:
        x_train = d['x_train']
        y_train = d['y_train']
        x_test = d['x_test']
        y_test = d['y_test']
    # The original also carried commented-out reads of 'w' and 'b'; they are
    # deliberately not returned so the result stays a 4-tuple.
    return x_train, y_train, x_test, y_test
if __name__ == '__main__':
    # get_data() returns exactly four arrays. The previous six-name unpacking
    # (which also expected w and b) raised ValueError before printing anything.
    x_train, y_train, x_test, y_test = get_data()
    print(x_train, y_train, x_test, y_test)
    print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)
训练数据保存模型
#coding:utf-8
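'''
1. Load the data.
2. Randomly initialise the parameters w and b; choose the learning rate alpha and the number of iterations, epochs.
3. Compute the predictions Y_ with logistic regression.
4. Compute the cost function J.
5. Compute the partial derivatives (gradients) of J with respect to w and b, namely dw and db; the intermediate variable is dz.
6. Update w and b: w = w - alpha * dw, b = b - alpha * db.
7. Repeat steps 3-6, epochs times in total.
'''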
import numpy as np
from data import get_data
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    The value is computed as ``exp(-log(1 + exp(-z)))`` with
    ``np.logaddexp`` evaluating the inner logarithm, so very negative inputs
    no longer overflow ``exp`` (the direct formula emitted an overflow
    RuntimeWarning there).

    Args:
        z: Scalar or array-like of real numbers.

    Returns:
        The sigmoid of ``z``, with values in [0, 1] and the shape of ``z``.
    """
    # logaddexp(0, -z) == log(exp(0) + exp(-z)) == log(1 + exp(-z)).
    return np.exp(-np.logaddexp(0.0, np.negative(z)))
def loss(y, y_, eps=1e-12):
    """Return the element-wise binary cross-entropy between labels and predictions.

    Computes ``-(y * log(p) + (1 - y) * log(1 - p))`` where ``p`` is ``y_``
    clipped to ``[eps, 1 - eps]``. Without the clip, a prediction of exactly
    0 or 1 produced ``0 * log(0)`` (NaN) or an infinite loss.

    Args:
        y: True labels (0 or 1), scalar or array-like.
        y_: Predicted probabilities, broadcastable against ``y``.
        eps: Clip margin that keeps both logarithms finite.

    Returns:
        The per-sample loss; the caller is expected to average it.
    """
    # Work in float64 so that 1 - eps stays strictly below 1 after the clip.
    p = np.clip(np.asarray(y_, dtype=np.float64), eps, 1.0 - eps)
    return -(y * np.log(p) + (1 - y) * np.log(1 - p))
if __name__ == '__main__':
    # get_data() returns four arrays; the earlier six-name unpacking (with
    # the unused w_data/b_data) raised ValueError before training started.
    x_train, y_train, x_test, y_test = get_data()
    # Flatten the labels so they line up element-wise with the 1-D
    # predictions even if the dataset stores them as a column (N x 1).
    y_train = np.ravel(y_train)

    # Random initial parameters: one weight per feature column instead of a
    # hard-coded 10, plus a scalar bias.
    W = np.random.rand(x_train.shape[1])
    b = np.random.rand()
    alpha = 0.5     # learning rate
    epochs = 10000  # number of full-batch gradient steps

    for i in range(epochs):
        # Forward pass: linear score, then predicted probability.
        z = np.sum(W * x_train, axis=-1) + b
        y_ = sigmoid(z)
        # Mean cross-entropy over the whole training set.
        J = np.mean(loss(y_train, y_))
        # Gradient of J w.r.t. z is (prediction - label); averaging its
        # product with each feature gives dw, averaging it alone gives db.
        dz = y_ - y_train
        dw = np.mean(x_train * dz[:, np.newaxis], axis=0)
        db = np.mean(dz)
        # Gradient-descent update.
        W = W - alpha * dw
        b = b - alpha * db
        print('第%d个epoch: 成本函数值J=%f,dw的平均值为%f,db的值为%f'%(i,J,np.mean(dw),db ))

    # Persist the trained parameters for the evaluation script.
    np.savez('model.npz', W=W, b=b)
用测试集测试
#coding:utf-8
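'''
1. Using the saved w and b, compute the predictions y_ for the test data x_test; the intermediate variable is the linear-regression value z.
2. Post-process the predictions: test whether each is greater than 0.5, convert the result to a number, and store it as Y_.
3. Count how many entries of the thresholded predictions Y_ equal the true labels y.
'''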
import numpy as np
from data import get_data
from logistic import sigmoid
if __name__ == '__main__':
    x_train, y_train, x_test, y_test = get_data()

    # Read the trained parameters and close the archive straight away.
    with np.load('model.npz') as d:
        w = d['W']
        b = d['b']

    # Flatten the labels: if they were stored as a column (N x 1), comparing
    # them with the 1-D predictions would broadcast to an N x N matrix and
    # silently give a wrong accuracy.
    y_true = np.ravel(y_test)

    # Linear score -> probability -> hard 0.0/1.0 decision at 0.5.
    z = np.sum(w * x_test, axis=-1) + b
    y_ = sigmoid(z)
    y_pred_float = (y_ > 0.5).astype(np.float64)

    # The np.int alias was removed in NumPy 1.24; the mean of the boolean
    # match mask is the same fraction of correct predictions.
    accuracy = np.mean(y_pred_float == y_true)
    print('accuracy:', accuracy)
结果